From 32c9eff2fff8ee91a60c9410c69042dc4c1cc5c8 Mon Sep 17 00:00:00 2001
From: Jee Jee Li <pandaleefree@gmail.com>
Date: Mon, 6 Jan 2025 23:22:25 +0800
Subject: [PATCH 001/115] [Bugfix][V1] Fix molmo text-only inputs (#11676)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
---
 .../vision_language/test_models.py            | 10 ++
 .../vision_language/vlm_utils/model_utils.py  | 99 ++++++++++++++++++-
 vllm/model_executor/models/molmo.py           | 56 ++++-------
 3 files changed, 123 insertions(+), 42 deletions(-)

diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py
index dc0b683c1f1cb..146685738a1d0 100644
--- a/tests/models/decoder_only/vision_language/test_models.py
+++ b/tests/models/decoder_only/vision_language/test_models.py
@@ -341,6 +341,16 @@
         ),
         hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
     ),
+    "molmo": VLMTestInfo(
+        models=["allenai/Molmo-7B-D-0924"],
+        test_type=(VLMTestType.IMAGE),
+        prompt_formatter=lambda img_prompt:"User: " + img_prompt + " Assistant:", # noqa: E501
+        max_model_len=4096,
+        max_num_seqs=2,
+        image_size_factors=[(),(1.0, 1.0, 1.0)],
+        patch_hf_runner=model_utils.mlomo_patch_hf_runner,
+        postprocess_inputs=model_utils.molmo_post_processor,
+    ),
     # Tests for phi3v currently live in another file because of a bug in
     # transformers. Once this issue is fixed, we can enable them here instead.
     # https://github.com/huggingface/transformers/issues/34307
diff --git a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py
index 3eca8fb9dcb1a..6c7a753af787e 100644
--- a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py
+++ b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py
@@ -5,17 +5,20 @@
 import re
 import types
 from pathlib import PosixPath
-from typing import Callable, List, Optional, Tuple, Union
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 
 import torch
 from PIL.Image import Image
-from transformers import AutoConfig, AutoTokenizer, BatchEncoding
+from transformers import (AutoConfig, AutoTokenizer, BatchEncoding,
+                          GenerationConfig)
 
 from vllm.sequence import SampleLogprobs
 from vllm.transformers_utils.tokenizer import patch_padding_side
 from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
 
-from .....conftest import HfRunner, ImageAsset, _ImageAssets
+from .....conftest import (HfRunner, ImageAsset, PromptAudioInput,
+                           PromptImageInput, PromptVideoInput, _ImageAssets)
+from ....utils import TokensTextLogprobs
 from .types import RunnerOutput
 
 
@@ -222,6 +225,11 @@ def wrap_inputs_post_processor(hf_inputs: BatchEncoding, dtype: str):
     return {"model_inputs": hf_inputs}
 
 
+def molmo_post_processor(hf_inputs: BatchEncoding, dtype: str):
+    cast_inputs = cast_dtype_post_processor("images")(hf_inputs, dtype)
+    return {name: tensor.unsqueeze(0) for name, tensor in cast_inputs.items()}
+
+
 ####### Prompt path encoders for models that need models on disk
 def qwen_prompt_path_encoder(
         tmp_path: PosixPath, prompt: str, assets: Union[List[ImageAsset],
@@ -451,3 +459,88 @@ def _generate(self, *args, **kwargs):
     hf_model.model.generate = types.MethodType(_generate, hf_model.model)
 
     return hf_model
+
+
+def _generate_greedy_logprobs_limit(
+    self,
+    prompts: List[str],
+    max_tokens: int,
+    num_logprobs: int,
+    images: Optional[PromptImageInput] = None,
+    audios: Optional[PromptAudioInput] = None,
+    videos: Optional[PromptVideoInput] = None,
+    **kwargs: Any,
+) -> List[TokensTextLogprobs]:
+    """Greedy-decode all prompts as one batch via `generate_from_batch`.
+
+    Bound onto an HfRunner by `mlomo_patch_hf_runner`, which supplies `self`.
+
+    Args:
+        prompts: Text prompts forwarded to `self.get_inputs`.
+        max_tokens: Used as `max_new_tokens` for generation.
+        num_logprobs: Forwarded to `self._hidden_states_to_logprobs`.
+        images: Optional image inputs forwarded to `self.get_inputs`.
+        audios: Optional audio inputs forwarded to `self.get_inputs`.
+        videos: Optional video inputs forwarded to `self.get_inputs`.
+        **kwargs: Accepted but not used by this function.
+
+    Returns:
+        One `(token ids, decoded text, logprobs)` tuple per `get_inputs` entry.
+    """
+    all_inputs = self.get_inputs(prompts,
+                                 images=images,
+                                 videos=videos,
+                                 audios=audios)
+
+    if len(all_inputs):
+        # Concatenate each per-prompt tensor along dim 0 into one batch.
+        batch_keys = ("input_ids", "images", "image_input_idx", "image_masks")
+        batch_inputs = {
+            key: torch.cat([inputs[key] for inputs in all_inputs], dim=0)
+            for key in batch_keys
+        }
+        outputs = self.model.generate_from_batch(
+            batch=self.wrap_device(batch_inputs,
+                                   device=self.model.device.type),
+            generation_config=GenerationConfig(
+                max_new_tokens=max_tokens,
+                stop_strings="<|endoftext|>",
+                do_sample=False,
+            ),
+            tokenizer=self.tokenizer,
+            output_hidden_states=True,
+            return_dict_in_generate=True,
+        )
+
+    all_output_ids: List[List[int]] = []
+    all_output_strs: List[str] = []
+    all_logprobs: List[List[Dict[int, float]]] = []
+    for seq_idx in range(len(all_inputs)):
+        # NOTE(review): the whole batch's hidden states are passed each time,
+        # not a slice for `seq_idx`; confirm the helper handles this.
+        seq_logprobs, num_new = self._hidden_states_to_logprobs(
+            outputs.hidden_states, num_logprobs)
+        new_ids = outputs.sequences[seq_idx][-num_new:]
+        all_output_ids.append(new_ids.tolist())
+        all_output_strs.append(self.tokenizer.decode(new_ids))
+        all_logprobs.append(seq_logprobs)
+    return list(zip(all_output_ids, all_output_strs, all_logprobs))
+
+
+####### Molmo-specific HuggingFace runner patchers
+def mlomo_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
+    """Adapt an HfRunner in place for Molmo and return it.
+
+    `processor` is replaced by a callable forwarding to the original
+    processor's `process`; `generate_greedy_logprobs_limit` is bound to
+    `_generate_greedy_logprobs_limit`.
+    """
+    original_processor = hf_model.processor
+
+    def _processor(*args, **kwargs):
+        return original_processor.process(*args, **kwargs)
+
+    hf_model.processor = _processor
+    bound = types.MethodType(_generate_greedy_logprobs_limit, hf_model)
+    setattr(hf_model, "generate_greedy_logprobs_limit", bound)  # noqa: B010
+    return hf_model
diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py
index cc25be9f5b6a9..0e8287bb56b6b 100644
--- a/vllm/model_executor/models/molmo.py
+++ b/vllm/model_executor/models/molmo.py
@@ -1081,45 +1081,25 @@ def input_processor_for_molmo(ctx: InputContext, inputs: DecoderOnlyInputs):
     else:
         out = processor.process(None, image, tokens=inputs["prompt_token_ids"])
 
-    image_processor = processor.image_processor
-    max_total_crops = 1 + image_processor.max_crops
-    if image is not None:
-        images, image_input_idx, image_masks = pad_images(
-            max_total_crops,
-            out["images"],
-            out["image_input_idx"],
-            out.get("image_masks"),
-        )
-    else:
-        base_image_input_size = image_processor.base_image_input_size
-        image_patch_size = image_processor.image_patch_size
-        image_num_patch = (
-            base_image_input_size[0] // image_patch_size,
-            base_image_input_size[1] // image_patch_size,
-        )
-        n_pixels = image_patch_size * image_patch_size * 3
-        n_patches = image_num_patch[0] * image_num_patch[1]
-
-        image_length_w = image_processor.image_token_length_w
-        image_length_h = image_processor.image_token_length_h
-        tokens_per_image = image_length_w * image_length_h
-        images = torch.full(
-            (max_total_crops, n_patches, n_pixels),
-            -1,
-            dtype=torch.float32,
-        )
-        image_input_idx = torch.full(
-            (max_total_crops, tokens_per_image),
-            -1,
-            dtype=torch.int32,
+    # If there is no image, return directly.
+    if image is None:
+        new_prompt_token_ids = out["input_ids"].tolist()
+        prompt = inputs.get("prompt")
+        if prompt is None:
+            prompt = tokenizer.decode(new_prompt_token_ids)
+        return token_inputs(
+            prompt_token_ids=new_prompt_token_ids,
+            prompt=prompt,
         )
-        if image_processor.image_padding_mask:
-            image_masks = torch.full(
-                (max_total_crops, n_patches),
-                -1,
-                dtype=torch.float32,
-            )
 
+    image_processor = processor.image_processor
+    max_total_crops = 1 + image_processor.max_crops
+    images, image_input_idx, image_masks = pad_images(
+        max_total_crops,
+        out["images"],
+        out["image_input_idx"],
+        out.get("image_masks"),
+    )
     image_data = dict(
         images=images,
         image_input_idx=image_input_idx,
@@ -1143,11 +1123,9 @@ def input_processor_for_molmo(ctx: InputContext, inputs: DecoderOnlyInputs):
                 offset = i
             size += 1
     image_data["image_start_end"] = (offset, offset + size)
-
     prompt = inputs.get("prompt")
     if prompt is None:
         prompt = tokenizer.decode(new_prompt_token_ids)
-
     return token_inputs(
         prompt_token_ids=new_prompt_token_ids,
         prompt=prompt,

From e20c92bb618384ce8d0013e0c9ad273d0c23d65b Mon Sep 17 00:00:00 2001
From: Chen Zhang <zhangch99@outlook.com>
Date: Tue, 7 Jan 2025 00:11:28 +0800
Subject: [PATCH 002/115] [Kernel] Move attn_type to Attention.__init__()
 (#11690)

Signed-off-by: Chen Zhang <zhangch99@outlook.com>
---
 tests/kernels/test_encoder_decoder_attn.py  | 100 ++++++++++----------
 tests/kernels/utils.py                      |  12 ++-
 vllm/attention/backends/abstract.py         |   2 +-
 vllm/attention/backends/blocksparse_attn.py |  14 +--
 vllm/attention/backends/flash_attn.py       |   4 +-
 vllm/attention/backends/flashinfer.py       |  15 ++-
 vllm/attention/backends/hpu_attn.py         |  13 +--
 vllm/attention/backends/ipex_attn.py        |  12 +--
 vllm/attention/backends/pallas.py           |  13 +--
 vllm/attention/backends/rocm_flash_attn.py  |  14 +--
 vllm/attention/backends/torch_sdpa.py       |   4 +-
 vllm/attention/backends/xformers.py         |   6 +-
 vllm/attention/layer.py                     |  37 ++------
 vllm/model_executor/models/bart.py          |  44 +++------
 vllm/model_executor/models/bert.py          |  10 +-
 vllm/model_executor/models/mllama.py        |  11 +--
 vllm/model_executor/models/qwen2.py         |  35 ++++---
 vllm/v1/attention/backends/flash_attn.py    |  14 +--
 18 files changed, 159 insertions(+), 201 deletions(-)

diff --git a/tests/kernels/test_encoder_decoder_attn.py b/tests/kernels/test_encoder_decoder_attn.py
index d943b048b7934..614674375786e 100644
--- a/tests/kernels/test_encoder_decoder_attn.py
+++ b/tests/kernels/test_encoder_decoder_attn.py
@@ -13,8 +13,7 @@
 import torch
 
 from tests.kernels.utils import *
-from vllm.attention import (Attention, AttentionBackend, AttentionMetadata,
-                            AttentionType)
+from vllm.attention import Attention, AttentionMetadata, AttentionType
 from vllm.attention.backends.utils import STR_NOT_IMPL_ENC_DEC_ROCM_HIP
 from vllm.attention.selector import (_Backend, _cached_get_attn_backend,
                                      global_force_attn_backend_context_manager)
@@ -64,6 +63,7 @@ class TestPoint(NamedTuple):
     max_dec_seq_len: int
     max_enc_seq_len: int
     num_blocks: int
+    attn_type: AttentionType
 
 
 class TestResources(NamedTuple):
@@ -96,7 +96,6 @@ class TestResources(NamedTuple):
     '''
 
     scale: float
-    attn_backend: AttentionBackend
     attn: Attention
     kv_cache: torch.Tensor
 
@@ -129,16 +128,17 @@ class that Attention will automatically select when it is constructed.
     '''
 
     scale = float(1.0 / (test_pt.head_size**0.5))
-    attn_backend = make_backend(test_pt.backend_name)
     attn = Attention(
         test_pt.num_heads,
         test_pt.head_size,
         scale=scale,
+        prefix=f"{test_pt.attn_type}",
+        attn_type=test_pt.attn_type,
     )
     if test_pt.num_blocks is None or test_pt.num_heads is None:
         # Caller does not require a KV cache
         return TestResources(
-            scale, attn_backend, attn,
+            scale, attn,
             torch.tensor([], dtype=torch.float32, device=CUDA_DEVICE))
 
     # Construct KV cache
@@ -148,7 +148,7 @@ class that Attention will automatically select when it is constructed.
                              test_pt.block_size,
                              device=CUDA_DEVICE,
                              backend=test_pt.backend_name)
-    return TestResources(scale, attn_backend, attn, kv_cache)
+    return TestResources(scale, attn, kv_cache)
 
 
 def _encoder_attn_setup(
@@ -193,6 +193,7 @@ def _encoder_attn_setup(
         _,
         max_q_seq_len,
         _,
+        _,
     ) = test_pt
 
     scale = test_rsrcs.scale
@@ -301,6 +302,7 @@ def _decoder_attn_setup(
         max_q_seq_len,
         _,
         _,
+        _,
     ) = test_pt
 
     scale = test_rsrcs.scale
@@ -488,6 +490,7 @@ def _enc_dec_cross_attn_setup_reuses_query(
         max_decoder_seq_len,
         max_encoder_seq_len,
         _,
+        _,
     ) = test_pt
 
     scale = test_rsrcs.scale
@@ -622,7 +625,6 @@ def _run_encoder_attention_test(
       & attn_metadata
     '''
     assert attn_metadata.num_decode_tokens == 0
-    attn_type = AttentionType.ENCODER
     packed_qkv = encoder_test_params.packed_qkvo.packed_qkv
     assert packed_qkv is not None
     with set_forward_context(attn_metadata, vllm_config):
@@ -635,14 +637,11 @@ def _run_encoder_attention_test(
         # is shaped as [num_tokens, hidden_size] and we can skip the reshape.
         reshaped_query = packed_qkv.query.view(
             -1, test_pt.num_heads * test_pt.head_size)
-        return attn.forward(reshaped_query,
-                            packed_qkv.key,
-                            packed_qkv.value,
-                            torch.tensor([],
-                                         dtype=torch.float32,
-                                         device=packed_qkv.query.device),
-                            attn_metadata,
-                            attn_type=attn_type)
+        return attn.forward(
+            reshaped_query, packed_qkv.key, packed_qkv.value,
+            torch.tensor([],
+                         dtype=torch.float32,
+                         device=packed_qkv.query.device), attn_metadata)
 
 
 def _run_decoder_self_attention_test(
@@ -675,7 +674,6 @@ def _run_decoder_self_attention_test(
     * Attention.forward() applied to packed_{query,key,value}, kv_cache
       & attn_metadata
     '''
-    attn_type = AttentionType.DECODER
     attn = test_rsrcs.attn
     kv_cache = test_rsrcs.kv_cache
     packed_qkv = decoder_test_params.packed_qkvo.packed_qkv
@@ -690,12 +688,8 @@ def _run_decoder_self_attention_test(
         # is shaped as [num_tokens, hidden_size] and we can skip the reshape.
         reshaped_query = packed_qkv.query.view(
             -1, test_pt.num_heads * test_pt.head_size)
-        return attn.forward(reshaped_query,
-                            packed_qkv.key,
-                            packed_qkv.value,
-                            kv_cache,
-                            attn_metadata,
-                            attn_type=attn_type)
+        return attn.forward(reshaped_query, packed_qkv.key, packed_qkv.value,
+                            kv_cache, attn_metadata)
 
 
 def _run_encoder_decoder_cross_attention_test(
@@ -742,7 +736,6 @@ def _run_encoder_decoder_cross_attention_test(
     '''
     assert decoder_test_params.packed_qkvo.packed_qkv is not None
 
-    attn_type = AttentionType.ENCODER_DECODER
     attn = test_rsrcs.attn
     kv_cache = test_rsrcs.kv_cache
     if cross_test_params is None:
@@ -762,12 +755,8 @@ def _run_encoder_decoder_cross_attention_test(
         # is shaped as [num_tokens, hidden_size] and we can skip the reshape.
         reshaped_query = decoder_test_params.packed_qkvo.packed_qkv.query.view(
             -1, test_pt.num_heads * test_pt.head_size)
-        return attn.forward(reshaped_query,
-                            key,
-                            value,
-                            kv_cache,
-                            attn_metadata,
-                            attn_type=attn_type)
+        return attn.forward(reshaped_query, key, value, kv_cache,
+                            attn_metadata)
 
 
 @pytest.fixture(autouse=True)
@@ -839,7 +828,7 @@ def test_encoder_only(
         # is not part of this test
         test_pt = TestPoint(num_heads, head_size, attn_backend.name,
                             batch_size, block_size, max_dec_seq_len,
-                            max_enc_seq_len, 4096)
+                            max_enc_seq_len, 4096, AttentionType.ENCODER)
 
         # Attention scale factor, attention backend instance, attention wrapper
         # instance, KV cache init
@@ -855,7 +844,7 @@ def test_encoder_only(
         # Shared prefill metadata structure
 
         prephase_attn_metadata: AttentionMetadata = make_test_metadata(
-            test_rsrcs.attn_backend,
+            attn_backend,
             True,
             None,
             decoder_test_params=None,
@@ -961,20 +950,29 @@ def test_e2e_enc_dec_attn(
         # Note: KV cache size of 4096 is arbitrary & chosen intentionally
         # to be more than necessary, since exceeding the kv cache size
         # is not part of this test
-        test_pt = TestPoint(num_heads, head_size, attn_backend.name,
-                            batch_size, block_size, max_dec_seq_len,
-                            max_enc_seq_len, 4096)
+        enc_test_pt = TestPoint(num_heads, head_size, attn_backend.name,
+                                batch_size, block_size, max_dec_seq_len,
+                                max_enc_seq_len, 4096, AttentionType.ENCODER)
+        enc_dec_test_pt = TestPoint(num_heads, head_size, attn_backend.name,
+                                    batch_size, block_size, max_dec_seq_len,
+                                    max_enc_seq_len, 4096,
+                                    AttentionType.ENCODER_DECODER)
+        dec_test_pt = TestPoint(num_heads, head_size, attn_backend.name,
+                                batch_size, block_size, max_dec_seq_len,
+                                max_enc_seq_len, 4096, AttentionType.DECODER)
 
         # Attention scale factor, attention backend instance, attention wrapper
         # instance, KV cache init
         vllm_config = VllmConfig()
         with set_current_vllm_config(vllm_config):
-            test_rsrcs = _make_test_resources(test_pt)
+            enc_test_rsrcs = _make_test_resources(enc_test_pt)
+            enc_dec_test_rsrcs = _make_test_resources(enc_dec_test_pt)
+            dec_test_rsrcs = _make_test_resources(dec_test_pt)
 
         # Construct encoder attention test params (only used
         # during prefill)
 
-        enc_test_params = _encoder_attn_setup(test_pt, test_rsrcs)
+        enc_test_params = _encoder_attn_setup(enc_test_pt, enc_test_rsrcs)
 
         # Construct Decoder self-attention prefill-phase & decode-phase
         # test params, including query/key/value tensors, decoder self-attention
@@ -987,7 +985,7 @@ def test_e2e_enc_dec_attn(
             prephase_dec_test_params,
             decphase_dec_test_params,
             cross_block_base_addr,
-        ) = _decoder_attn_setup(test_pt, test_rsrcs)
+        ) = _decoder_attn_setup(dec_test_pt, dec_test_rsrcs)
 
         # Construct encoder/decoder cross-attention prefill-phase
         # & decode-phase test params, including key/value tensors,
@@ -1000,14 +998,14 @@ def test_e2e_enc_dec_attn(
             dec_qkv,
             enc_test_params,
             prephase_dec_test_params,
-            test_pt,
-            test_rsrcs,
+            enc_dec_test_pt,
+            enc_dec_test_rsrcs,
             block_base_addr=cross_block_base_addr)
 
         # Shared prefill metadata structure
         assert prephase_dec_test_params.packed_qkvo.packed_qkv is not None
         prephase_attn_metadata: AttentionMetadata = make_test_metadata(
-            test_rsrcs.attn_backend,
+            attn_backend,
             True,
             prephase_dec_test_params.packed_qkvo.packed_qkv.q_seq_lens,
             decoder_test_params=prephase_dec_test_params,
@@ -1017,10 +1015,10 @@ def test_e2e_enc_dec_attn(
 
         # PREFILL: encoder attention
 
-        enc_pckd_act_out = _run_encoder_attention_test(test_rsrcs.attn,
+        enc_pckd_act_out = _run_encoder_attention_test(enc_test_rsrcs.attn,
                                                        enc_test_params,
                                                        prephase_attn_metadata,
-                                                       test_pt=test_pt,
+                                                       test_pt=enc_test_pt,
                                                        vllm_config=vllm_config)
 
         # - Is encoder attention result correct?
@@ -1030,10 +1028,10 @@ def test_e2e_enc_dec_attn(
         # PREFILL: decoder self-attention test
 
         prephase_dec_pckd_act_out = _run_decoder_self_attention_test(
-            test_rsrcs,
+            dec_test_rsrcs,
             prephase_dec_test_params,
             prephase_attn_metadata,
-            test_pt=test_pt,
+            test_pt=dec_test_pt,
             vllm_config=vllm_config)
 
         # - Is prefill decoder self-attention correct?
@@ -1044,11 +1042,11 @@ def test_e2e_enc_dec_attn(
         # PREFILL: encoder/decoder cross-attention test
 
         prephase_cross_pckd_act_out = _run_encoder_decoder_cross_attention_test(
-            test_rsrcs,
+            enc_dec_test_rsrcs,
             prephase_dec_test_params,
             prephase_cross_test_params,
             prephase_attn_metadata,
-            test_pt=test_pt,
+            test_pt=enc_dec_test_pt,
             vllm_config=vllm_config)
 
         # - Is prefill encoder/decoder cross-attention correct?
@@ -1059,7 +1057,7 @@ def test_e2e_enc_dec_attn(
         # DECODE: build decode-phase attention metadata
 
         decphase_attn_metadata: AttentionMetadata = make_test_metadata(
-            test_rsrcs.attn_backend,
+            attn_backend,
             False,
             dec_qkv.q_seq_lens,
             decoder_test_params=decphase_dec_test_params,
@@ -1070,10 +1068,10 @@ def test_e2e_enc_dec_attn(
         # DECODE: decoder self-attention test
 
         decphase_dec_pckd_act_out = _run_decoder_self_attention_test(
-            test_rsrcs,
+            dec_test_rsrcs,
             decphase_dec_test_params,
             decphase_attn_metadata,
-            test_pt=test_pt,
+            test_pt=dec_test_pt,
             vllm_config=vllm_config)
 
         # - Is decode-phase decoder self-attention correct?
@@ -1084,11 +1082,11 @@ def test_e2e_enc_dec_attn(
         # DECODE: encoder/decoder cross-attention test
 
         decphase_cross_pckd_act_out = _run_encoder_decoder_cross_attention_test(
-            test_rsrcs,
+            enc_dec_test_rsrcs,
             decphase_dec_test_params,
             None,
             decphase_attn_metadata,
-            test_pt=test_pt,
+            test_pt=enc_dec_test_pt,
             vllm_config=vllm_config)
 
         # - Is decode-phase encoder/decoder cross-attention correct?
diff --git a/tests/kernels/utils.py b/tests/kernels/utils.py
index e7865fb2500ef..848eea7f54cab 100644
--- a/tests/kernels/utils.py
+++ b/tests/kernels/utils.py
@@ -13,6 +13,7 @@
 
 from vllm.attention import AttentionBackend, AttentionMetadata, AttentionType
 from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.platforms.interface import _Backend
 from vllm.utils import (STR_BACKEND_ENV_VAR, STR_FLASH_ATTN_VAL,
                         STR_XFORMERS_ATTN_VAL, make_tensor_with_pad)
 
@@ -790,7 +791,7 @@ def make_block_tables_slot_mapping(
 
 
 def make_test_metadata(
-    attn_backend: AttentionBackend,
+    attn_backend: _Backend,
     is_prompt: bool,
     seq_lens: Optional[List[int]],
     decoder_test_params: Optional[PhaseTestParameters],
@@ -815,7 +816,7 @@ def make_test_metadata(
 
     Arguments:
 
-    * attn_backend: Backend for sourcing attention kernels
+    * attn_backend: _Backend value whose name selects the attention backend
     * is_prompt: prefill if True, o/w decode
     * seq_lens: list of token counts for each sequence
     * decoder_test_params: decoder self-attention test params; 
@@ -882,6 +883,8 @@ def make_test_metadata(
         #   (kv_mmap)
         cross_kv_mmap = cross_test_params.kv_mmap
 
+    attn_backend_obj = make_backend(attn_backend.name)
+
     if is_prompt:
         # Prefill-phase scenario
 
@@ -902,8 +905,7 @@ def make_test_metadata(
                                    context_lens,
                                    encoder_seq_lens,
                                    device=device)
-
-        return attn_backend.make_metadata(
+        return attn_backend_obj.make_metadata(
             num_prefills=num_prefills,
             slot_mapping=(None if kv_mmap is None else kv_mmap.slot_mapping),
             multi_modal_placeholder_index_maps=None,
@@ -952,7 +954,7 @@ def make_test_metadata(
                                    encoder_seq_lens,
                                    device=device)
 
-        return attn_backend.make_metadata(
+        return attn_backend_obj.make_metadata(
             num_prefills=num_prefills,
             slot_mapping=kv_mmap.slot_mapping,
             multi_modal_placeholder_index_maps=None,
diff --git a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py
index aed04361e5fb4..f5dcaea79af93 100644
--- a/vllm/attention/backends/abstract.py
+++ b/vllm/attention/backends/abstract.py
@@ -233,6 +233,7 @@ def __init__(
         kv_cache_dtype: str = "auto",
         blocksparse_params: Optional[Dict[str, Any]] = None,
         logits_soft_cap: Optional[float] = None,
+        attn_type: str = AttentionType.DECODER,
     ) -> None:
         raise NotImplementedError
 
@@ -246,7 +247,6 @@ def forward(
         attn_metadata: T,
         k_scale: float = 1.0,
         v_scale: float = 1.0,
-        attn_type: str = AttentionType.DECODER,
         output: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         raise NotImplementedError
diff --git a/vllm/attention/backends/blocksparse_attn.py b/vllm/attention/backends/blocksparse_attn.py
index 99cb84346d84e..7089d59392c36 100644
--- a/vllm/attention/backends/blocksparse_attn.py
+++ b/vllm/attention/backends/blocksparse_attn.py
@@ -300,6 +300,7 @@ def __init__(
         kv_cache_dtype: str,
         blocksparse_params: Optional[Dict[str, Any]] = None,
         logits_soft_cap: Optional[float] = None,
+        attn_type: str = AttentionType.DECODER,
     ) -> None:
         assert blocksparse_params is not None
         assert alibi_slopes is None, ValueError(
@@ -350,6 +351,12 @@ def __init__(
             active_head_range=self.blocksparse_params.active_head_range,
         )
 
+        if attn_type != AttentionType.DECODER:
+            raise NotImplementedError("Encoder self-attention and "
+                                      "encoder/decoder cross-attention "
+                                      "are not implemented for "
+                                      "BlocksparseFlashAttentionImpl")
+
     def forward(
         self,
         query: torch.Tensor,
@@ -359,7 +366,6 @@ def forward(
         attn_metadata: BlocksparseFlashAttentionMetadata,
         k_scale: float = 1.0,
         v_scale: float = 1.0,
-        attn_type: str = AttentionType.DECODER,
         output: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         """Forward pass with FlashAttention and PagedAttention.
@@ -375,12 +381,6 @@ def forward(
         Returns:
             shape = [num_tokens, num_heads * head_size]
         """
-        if attn_type != AttentionType.DECODER:
-            raise NotImplementedError("Encoder self-attention and "
-                                      "encoder/decoder cross-attention "
-                                      "are not implemented for "
-                                      "BlocksparseFlashAttentionImpl")
-
         num_tokens, hidden_size = query.shape
         # Reshape the query, key, and value tensors.
         query = query.view(-1, self.num_heads, self.head_size)
diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py
index c69e12ad78c44..23ea244f07dfe 100644
--- a/vllm/attention/backends/flash_attn.py
+++ b/vllm/attention/backends/flash_attn.py
@@ -600,6 +600,7 @@ def __init__(
         kv_cache_dtype: str,
         blocksparse_params: Optional[Dict[str, Any]] = None,
         logits_soft_cap: Optional[float] = None,
+        attn_type: str = AttentionType.DECODER,
     ) -> None:
         if blocksparse_params is not None:
             raise ValueError(
@@ -627,6 +628,7 @@ def __init__(
             raise ValueError(
                 f"Head size {head_size} is not supported by FlashAttention. "
                 f"Supported head sizes are: {support_head_sizes}.")
+        self.attn_type = attn_type
 
     def forward(
         self,
@@ -637,7 +639,6 @@ def forward(
         attn_metadata: FlashAttentionMetadata,
         k_scale: float = 1.0,
         v_scale: float = 1.0,
-        attn_type: str = AttentionType.DECODER,
         output: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         """Forward pass with FlashAttention.
@@ -659,6 +660,7 @@ def forward(
 
         assert output is not None, "Output tensor must be provided."
 
+        attn_type = self.attn_type
         if (attn_type == AttentionType.ENCODER
                 and (not attn_metadata.is_all_encoder_attn_metadata_set)):
             raise AttributeError("Encoder attention requires setting "
diff --git a/vllm/attention/backends/flashinfer.py b/vllm/attention/backends/flashinfer.py
index e367468d05d26..a11462b2068a5 100644
--- a/vllm/attention/backends/flashinfer.py
+++ b/vllm/attention/backends/flashinfer.py
@@ -748,6 +748,7 @@ def __init__(
         kv_cache_dtype: str,
         blocksparse_params: Optional[Dict[str, Any]] = None,
         logits_soft_cap: Optional[float] = None,
+        attn_type: str = AttentionType.DECODER,
     ) -> None:
         self.num_heads = num_heads
         self.head_size = head_size
@@ -764,6 +765,12 @@ def __init__(
         assert self.num_heads % self.num_kv_heads == 0
         self.num_queries_per_kv = self.num_heads // self.num_kv_heads
 
+        if attn_type != AttentionType.DECODER:
+            raise NotImplementedError("Encoder self-attention and "
+                                      "encoder/decoder cross-attention "
+                                      "are not implemented for "
+                                      "FlashInferImpl")
+
     def forward(
         self,
         query: torch.Tensor,
@@ -773,18 +780,10 @@ def forward(
         attn_metadata: FlashInferMetadata,
         k_scale: float = 1.0,
         v_scale: float = 1.0,
-        attn_type: str = AttentionType.DECODER,
         output: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
 
         # TODO: directly write to output tensor
-
-        if attn_type != AttentionType.DECODER:
-            raise NotImplementedError("Encoder self-attention and "
-                                      "encoder/decoder cross-attention "
-                                      "are not implemented for "
-                                      "FlashInferImpl")
-
         num_heads: int = self.num_heads
         head_size: int = self.head_size
         num_kv_heads: int = self.num_kv_heads
diff --git a/vllm/attention/backends/hpu_attn.py b/vllm/attention/backends/hpu_attn.py
index f90d15d4207e7..94a461e0c8c29 100644
--- a/vllm/attention/backends/hpu_attn.py
+++ b/vllm/attention/backends/hpu_attn.py
@@ -102,6 +102,7 @@ def __init__(
         kv_cache_dtype: str,
         blocksparse_params: Optional[Dict[str, Any]] = None,
         max_seq_len: int = 4096,
+        attn_type: str = AttentionType.DECODER,
     ) -> None:
         super(AttentionImpl, self).__init__()
         self.kv_cache_dtype = kv_cache_dtype
@@ -143,6 +144,12 @@ def __init__(
                 f"Head size {head_size} is not supported by PagedAttention. "
                 f"Supported head sizes are: {suppored_head_sizes}.")
 
+        if attn_type != AttentionType.DECODER:
+            raise NotImplementedError("Encoder self-attention and "
+                                      "encoder/decoder cross-attention "
+                                      "are not implemented for "
+                                      "HPUAttentionImpl")
+
     def forward(
         self,
         query: torch.Tensor,
@@ -152,7 +159,6 @@ def forward(
         attn_metadata: HPUAttentionMetadata,
         k_scale: float = 1.0,
         v_scale: float = 1.0,
-        attn_type: str = AttentionType.DECODER,
         output: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         """Forward pass with xFormers and PagedAttention.
@@ -166,11 +172,6 @@ def forward(
         Returns:
             shape = [num_tokens, num_heads * head_size]
         """
-        if attn_type != AttentionType.DECODER:
-            raise NotImplementedError("Encoder self-attention and "
-                                      "encoder/decoder cross-attention "
-                                      "are not implemented for "
-                                      "HPUAttentionImpl")
         batch_size, seq_len, hidden_size = query.shape
         _, seq_len_kv, _ = key.shape
 
diff --git a/vllm/attention/backends/ipex_attn.py b/vllm/attention/backends/ipex_attn.py
index 21949874bea47..da1d307daa517 100644
--- a/vllm/attention/backends/ipex_attn.py
+++ b/vllm/attention/backends/ipex_attn.py
@@ -115,6 +115,7 @@ def __init__(
         kv_cache_dtype: str,
         blocksparse_params: Optional[Dict[str, Any]] = None,
         logits_soft_cap: Optional[float] = None,
+        attn_type: str = AttentionType.DECODER,
     ) -> None:
         if blocksparse_params is not None:
             raise ValueError(
@@ -146,6 +147,11 @@ def __init__(
             raise NotImplementedError(
                 "IPEX backend does not support FP8 KV cache. "
                 "Please use xFormers backend instead.")
+        if attn_type != AttentionType.DECODER:
+            raise NotImplementedError("Encoder self-attention and "
+                                      "encoder/decoder cross-attention "
+                                      "are not implemented for "
+                                      "IpexAttnBackendImpl")
 
     def split_kv_cache(
         self,
@@ -172,7 +178,6 @@ def forward(
         attn_metadata: IpexAttnMetadata,  # type: ignore
         k_scale: float = 1.0,
         v_scale: float = 1.0,
-        attn_type: str = AttentionType.DECODER,
         output: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         """Forward pass with IPEX varlen_attention and PagedAttention.
@@ -189,11 +194,6 @@ def forward(
             shape = [num_tokens, num_heads * head_size]
         """
         assert k_scale == 1.0 and v_scale == 1.0
-        if attn_type != AttentionType.DECODER:
-            raise NotImplementedError("Encoder self-attention and "
-                                      "encoder/decoder cross-attention "
-                                      "are not implemented for "
-                                      "IpexAttnBackendImpl")
         num_tokens, hidden_size = query.shape
         # Reshape the query, key, and value tensors.
         query = query.view(-1, self.num_heads, self.head_size)
diff --git a/vllm/attention/backends/pallas.py b/vllm/attention/backends/pallas.py
index 9809aed0e66f9..2ac492dd8ae54 100644
--- a/vllm/attention/backends/pallas.py
+++ b/vllm/attention/backends/pallas.py
@@ -100,6 +100,7 @@ def __init__(
         kv_cache_dtype: str,
         blocksparse_params: Optional[Dict[str, Any]] = None,
         logits_soft_cap: Optional[float] = None,
+        attn_type: str = AttentionType.DECODER,
     ) -> None:
         self.num_heads = num_heads
         self.head_size = head_size
@@ -141,6 +142,12 @@ def __init__(
                 # megacore mode will be None.
                 self.megacore_mode = "batch"
 
+        if attn_type != AttentionType.DECODER:
+            raise NotImplementedError("Encoder self-attention and "
+                                      "encoder/decoder cross-attention "
+                                      "are not implemented for "
+                                      "PallasAttentionBackendImpl")
+
     def forward(
         self,
         query: torch.Tensor,
@@ -150,7 +157,6 @@ def forward(
         attn_metadata: PallasMetadata,
         k_scale: float = 1.0,
         v_scale: float = 1.0,
-        attn_type: str = AttentionType.DECODER,
         output: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         """Forward pass with Pallas attention.
@@ -168,11 +174,6 @@ def forward(
             shape = [batch_size, seq_len, num_heads * head_size]
         """
         assert k_scale == 1.0 and v_scale == 1.0
-        if attn_type != AttentionType.DECODER:
-            raise NotImplementedError("Encoder self-attention and "
-                                      "encoder/decoder cross-attention "
-                                      "are not implemented for "
-                                      "PallasAttentionBackendImpl")
         batch_size, seq_len, hidden_size = query.shape
         query = query.view(batch_size, seq_len, self.num_heads, self.head_size)
         key = key.view(batch_size, seq_len, self.num_kv_heads, self.head_size)
diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py
index d43c15b661ef7..a91a5af5c3d58 100644
--- a/vllm/attention/backends/rocm_flash_attn.py
+++ b/vllm/attention/backends/rocm_flash_attn.py
@@ -338,6 +338,7 @@ def __init__(
         kv_cache_dtype: str,
         blocksparse_params: Optional[Dict[str, Any]] = None,
         logits_soft_cap: Optional[float] = None,
+        attn_type: str = AttentionType.DECODER,
     ) -> None:
         if blocksparse_params is not None:
             raise ValueError(
@@ -397,6 +398,12 @@ def __init__(
                 self.attn_func = _sdpa_attention
                 logger.debug("Using naive attention in ROCmBackend")
 
+        if attn_type != AttentionType.DECODER:
+            raise NotImplementedError("Encoder self-attention and "
+                                      "encoder/decoder cross-attention "
+                                      "are not implemented for "
+                                      "ROCmFlashAttentionImpl")
+
     def repeat_kv(self, x: torch.Tensor, n_rep: int) -> torch.Tensor:
         """torch.repeat_interleave(x, dim=1, repeats=n_rep)"""
         tokens, n_kv_heads, head_dim = x.shape
@@ -414,7 +421,6 @@ def forward(
         attn_metadata: ROCmFlashAttentionMetadata,
         k_scale: float = 1.0,
         v_scale: float = 1.0,
-        attn_type: str = AttentionType.DECODER,
         output: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         """Forward pass with FlashAttention and PagedAttention.
@@ -432,12 +438,6 @@ def forward(
         """
         # Reminder: Please update docs/source/features/compatibility_matrix.md
         # If the feature combo become valid
-        if attn_type != AttentionType.DECODER:
-            raise NotImplementedError("Encoder self-attention and "
-                                      "encoder/decoder cross-attention "
-                                      "are not implemented for "
-                                      "ROCmFlashAttentionImpl")
-
         num_tokens, hidden_size = query.shape
         # Reshape the query, key, and value tensors.
         query = query.view(-1, self.num_heads, self.head_size)
diff --git a/vllm/attention/backends/torch_sdpa.py b/vllm/attention/backends/torch_sdpa.py
index 0cff6f5952aba..c14f7754596dd 100644
--- a/vllm/attention/backends/torch_sdpa.py
+++ b/vllm/attention/backends/torch_sdpa.py
@@ -390,6 +390,7 @@ def __init__(
         kv_cache_dtype: str,
         blocksparse_params: Optional[Dict[str, Any]] = None,
         logits_soft_cap: Optional[float] = None,
+        attn_type: str = AttentionType.DECODER,
     ) -> None:
         if blocksparse_params is not None:
             raise ValueError(
@@ -421,6 +422,7 @@ def __init__(
             raise NotImplementedError(
                 "Torch SDPA backend does not support FP8 KV cache. "
                 "Please use xFormers backend instead.")
+        self.attn_type = attn_type
 
     def forward(
         self,
@@ -431,7 +433,6 @@ def forward(
         attn_metadata: TorchSDPAMetadata,  # type: ignore
         k_scale: float = 1.0,
         v_scale: float = 1.0,
-        attn_type: str = AttentionType.DECODER,
         output: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         """Forward pass with torch SDPA and PagedAttention.
@@ -448,6 +449,7 @@ def forward(
             shape = [num_tokens, num_heads * head_size]
         """
         assert k_scale == 1.0 and v_scale == 1.0
+        attn_type = self.attn_type
         if (attn_type == AttentionType.ENCODER
                 and (not attn_metadata.is_all_encoder_attn_metadata_set)):
             raise AttributeError("Encoder attention requires setting "
diff --git a/vllm/attention/backends/xformers.py b/vllm/attention/backends/xformers.py
index 3e59b3603d2c6..694c7cc1bc36a 100644
--- a/vllm/attention/backends/xformers.py
+++ b/vllm/attention/backends/xformers.py
@@ -379,6 +379,7 @@ def __init__(
         kv_cache_dtype: str,
         blocksparse_params: Optional[Dict[str, Any]] = None,
         logits_soft_cap: Optional[float] = None,
+        attn_type: str = AttentionType.DECODER,
     ) -> None:
         if blocksparse_params is not None:
             raise ValueError(
@@ -405,6 +406,8 @@ def __init__(
                 f"Head size {head_size} is not supported by PagedAttention. "
                 f"Supported head sizes are: {suppored_head_sizes}.")
 
+        self.attn_type = attn_type
+
     def forward(
         self,
         query: torch.Tensor,
@@ -414,7 +417,6 @@ def forward(
         attn_metadata: "XFormersMetadata",
         k_scale: float = 1.0,
         v_scale: float = 1.0,
-        attn_type: str = AttentionType.DECODER,
         output: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         """Forward pass with xFormers and PagedAttention.
@@ -468,7 +470,7 @@ def forward(
         Returns:
             shape = [num_tokens, num_heads * head_size]
         """
-
+        attn_type = self.attn_type
         # Check that appropriate attention metadata attributes are
         # selected for the desired attention type
         if (attn_type == AttentionType.ENCODER
diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py
index 69b6d1e4648df..f1b3598e60b54 100644
--- a/vllm/attention/layer.py
+++ b/vllm/attention/layer.py
@@ -41,6 +41,7 @@ def __init__(
         logits_soft_cap: Optional[float] = None,
         per_layer_sliding_window: Optional[int] = None,
         prefix: str = "",
+        attn_type: str = AttentionType.DECODER,
     ) -> None:
         super().__init__()
         if per_layer_sliding_window is not None:
@@ -96,7 +97,7 @@ def __init__(
         impl_cls = attn_backend.get_impl_cls()
         self.impl = impl_cls(num_heads, head_size, scale, num_kv_heads,
                              alibi_slopes, sliding_window, kv_cache_dtype,
-                             blocksparse_params, logits_soft_cap)
+                             blocksparse_params, logits_soft_cap, attn_type)
         self.num_heads = num_heads
         self.head_size = head_size
         self.num_kv_heads = num_kv_heads
@@ -119,6 +120,7 @@ def __init__(
             raise ValueError(f"Duplicate layer name: {prefix}")
         compilation_config.static_forward_context[prefix] = self
         self.layer_name = prefix
+        self.attn_type = attn_type
 
     def forward(
         self,
@@ -127,18 +129,12 @@ def forward(
         value: torch.Tensor,
         kv_cache: torch.Tensor,
         attn_metadata: AttentionMetadata,
-        attn_type: str = AttentionType.DECODER,
     ) -> torch.Tensor:
 
         if self.use_direct_call:
-            return self.impl.forward(query,
-                                     key,
-                                     value,
-                                     kv_cache,
-                                     attn_metadata,
-                                     self._k_scale,
-                                     self._v_scale,
-                                     attn_type=attn_type)
+            return self.impl.forward(query, key, value, kv_cache,
+                                     attn_metadata, self._k_scale,
+                                     self._v_scale)
         elif self.use_output:
             output = torch.empty_like(query)
             hidden_size = query.size(-1)
@@ -152,13 +148,11 @@ def forward(
             if value is not None:
                 value = value.view(-1, self.num_kv_heads, self.head_size)
             torch.ops.vllm.unified_attention_with_output(
-                query, key, value, output, kv_cache, attn_type,
-                self.layer_name)
+                query, key, value, output, kv_cache, self.layer_name)
             return output.view(-1, hidden_size)
         else:
             return torch.ops.vllm.unified_attention(query, key, value,
-                                                    kv_cache, attn_type,
-                                                    self.layer_name)
+                                                    kv_cache, self.layer_name)
 
     def extra_repr(self) -> str:
         s = f"head_size={self.impl.head_size}"  # type: ignore
@@ -237,20 +231,13 @@ def unified_attention(
     key: torch.Tensor,
     value: torch.Tensor,
     kv_cache: torch.Tensor,
-    attn_type: str,
     layer_name: str,
 ) -> torch.Tensor:
     forward_context: ForwardContext = get_forward_context()
     attn_metadata = forward_context.dynamic_forward_context
     self = forward_context.static_forward_context[layer_name]
-    return self.impl.forward(query,
-                             key,
-                             value,
-                             kv_cache,
-                             attn_metadata,
-                             self._k_scale,
-                             self._v_scale,
-                             attn_type=attn_type)
+    return self.impl.forward(query, key, value, kv_cache, attn_metadata,
+                             self._k_scale, self._v_scale)
 
 
 def unified_attention_fake(
@@ -258,7 +245,6 @@ def unified_attention_fake(
     key: torch.Tensor,
     value: torch.Tensor,
     kv_cache: torch.Tensor,
-    attn_type: str,
     layer_name: str,
 ) -> torch.Tensor:
     return torch.empty_like(query).contiguous()
@@ -279,7 +265,6 @@ def unified_attention_with_output(
     value: torch.Tensor,
     output: torch.Tensor,
     kv_cache: torch.Tensor,
-    attn_type: str,
     layer_name: str,
 ) -> None:
     forward_context: ForwardContext = get_forward_context()
@@ -292,7 +277,6 @@ def unified_attention_with_output(
                       attn_metadata,
                       self._k_scale,
                       self._v_scale,
-                      attn_type=attn_type,
                       output=output)
 
 
@@ -302,7 +286,6 @@ def unified_attention_with_output_fake(
     value: torch.Tensor,
     output: torch.Tensor,
     kv_cache: torch.Tensor,
-    attn_type: str,
     layer_name: str,
 ) -> None:
     return
diff --git a/vllm/model_executor/models/bart.py b/vllm/model_executor/models/bart.py
index 3776490cb3465..57eb5adc82d5b 100644
--- a/vllm/model_executor/models/bart.py
+++ b/vllm/model_executor/models/bart.py
@@ -71,12 +71,8 @@ def __init__(self, num_embeddings: int, embedding_dim: int):
     def forward(
         self,
         positions: torch.Tensor,
-        attn_type: AttentionType,
     ) -> torch.Tensor:
         """`input_ids' shape is expected to be [bsz x seqlen]."""
-
-        assert attn_type != AttentionType.ENCODER_DECODER
-
         return super().forward(positions + self.offset)
 
 
@@ -180,7 +176,8 @@ def __init__(
                               num_kv_heads=self.num_kv_heads,
                               cache_config=cache_config,
                               quant_config=quant_config,
-                              prefix=f"{prefix}.attn")
+                              prefix=f"{prefix}.attn",
+                              attn_type=AttentionType.ENCODER)
 
     def forward(self, hidden_states: torch.Tensor, kv_cache: torch.Tensor,
                 attn_metadata: AttentionMetadata) -> torch.Tensor:
@@ -189,12 +186,7 @@ def forward(self, hidden_states: torch.Tensor, kv_cache: torch.Tensor,
         qkv, _ = self.qkv_proj(hidden_states)
         q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
 
-        attn_output = self.attn(q,
-                                k,
-                                v,
-                                kv_cache,
-                                attn_metadata,
-                                attn_type=AttentionType.ENCODER)
+        attn_output = self.attn(q, k, v, kv_cache, attn_metadata)
 
         output, _ = self.out_proj(attn_output)
         return output
@@ -264,7 +256,8 @@ def __init__(
                               num_kv_heads=self.num_kv_heads,
                               cache_config=cache_config,
                               quant_config=quant_config,
-                              prefix=f"{prefix}.attn")
+                              prefix=f"{prefix}.attn",
+                              attn_type=AttentionType.DECODER)
 
     def forward(self, hidden_states: torch.Tensor, kv_cache: torch.Tensor,
                 attn_metadata: AttentionMetadata) -> torch.Tensor:
@@ -273,12 +266,7 @@ def forward(self, hidden_states: torch.Tensor, kv_cache: torch.Tensor,
         qkv, _ = self.qkv_proj(hidden_states)
         q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
 
-        attn_output = self.attn(q,
-                                k,
-                                v,
-                                kv_cache,
-                                attn_metadata,
-                                attn_type=AttentionType.DECODER)
+        attn_output = self.attn(q, k, v, kv_cache, attn_metadata)
 
         output, _ = self.out_proj(attn_output)
         return output
@@ -348,7 +336,8 @@ def __init__(
                               num_kv_heads=self.num_kv_heads,
                               cache_config=cache_config,
                               quant_config=quant_config,
-                              prefix=f"{prefix}.attn")
+                              prefix=f"{prefix}.attn",
+                              attn_type=AttentionType.ENCODER_DECODER)
 
     def forward(
         self,
@@ -372,12 +361,7 @@ def forward(
             _, k, v = qkv_enc.split([self.q_size, self.kv_size, self.kv_size],
                                     dim=-1)
 
-        attn_output = self.attn(q,
-                                k,
-                                v,
-                                kv_cache,
-                                attn_metadata,
-                                attn_type=AttentionType.ENCODER_DECODER)
+        attn_output = self.attn(q, k, v, kv_cache, attn_metadata)
 
         output, _ = self.out_proj(attn_output)
         return output
@@ -644,10 +628,7 @@ def forward(self, input_ids: torch.Tensor, positions: torch.Tensor,
         # retrieve input_ids and inputs_embeds
         inputs_embeds = self.embed_tokens(input_ids)
 
-        embed_pos = self.embed_positions(
-            positions,
-            AttentionType.ENCODER,
-        )
+        embed_pos = self.embed_positions(positions)
         embed_pos = embed_pos.to(inputs_embeds.device)
 
         hidden_states = inputs_embeds + embed_pos
@@ -734,10 +715,7 @@ def forward(self, decoder_input_ids: torch.Tensor,
         inputs_embeds = self.embed_tokens(decoder_input_ids)
 
         # embed positions
-        embed_pos = self.embed_positions(
-            decoder_positions,
-            AttentionType.DECODER,
-        )
+        embed_pos = self.embed_positions(decoder_positions)
         embed_pos = embed_pos.to(inputs_embeds.device)
 
         hidden_states = inputs_embeds + embed_pos
diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py
index c1d47b1bc9bcd..4be136543de15 100644
--- a/vllm/model_executor/models/bert.py
+++ b/vllm/model_executor/models/bert.py
@@ -238,7 +238,8 @@ def __init__(
                               num_kv_heads=self.num_kv_heads,
                               cache_config=cache_config,
                               quant_config=quant_config,
-                              prefix=f"{prefix}.attn")
+                              prefix=f"{prefix}.attn",
+                              attn_type=AttentionType.ENCODER_ONLY)
 
     def forward(
         self,
@@ -248,12 +249,7 @@ def forward(
     ) -> torch.Tensor:
         qkv, _ = self.qkv_proj(hidden_states)
         q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
-        output = self.attn(q,
-                           k,
-                           v,
-                           kv_cache,
-                           attn_metadata,
-                           attn_type=AttentionType.ENCODER_ONLY)
+        output = self.attn(q, k, v, kv_cache, attn_metadata)
         return output
 
 
diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py
index 6536f9807730c..c5046e06edecb 100644
--- a/vllm/model_executor/models/mllama.py
+++ b/vllm/model_executor/models/mllama.py
@@ -770,6 +770,7 @@ def __init__(
             self.scaling,
             self.num_local_key_value_heads,
             prefix=f"{prefix}.attn",
+            attn_type=AttentionType.ENCODER_DECODER,
         )
 
     def forward(
@@ -805,13 +806,9 @@ def forward(
                                                kv_range_for_decode,
                                                attn_metadata)
         else:
-            output = self.attn(q.view(-1,
-                                      self.num_local_heads * self.head_dim),
-                               k,
-                               v,
-                               kv_cache,
-                               attn_metadata,
-                               attn_type=AttentionType.ENCODER_DECODER)
+            output = self.attn(
+                q.view(-1, self.num_local_heads * self.head_dim), k, v,
+                kv_cache, attn_metadata)
         out, _ = self.o_proj(output)
         return out
 
diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py
index 88f4ea4352726..01745b5fd53e1 100644
--- a/vllm/model_executor/models/qwen2.py
+++ b/vllm/model_executor/models/qwen2.py
@@ -107,7 +107,8 @@ def __init__(self,
                  cache_config: Optional[CacheConfig] = None,
                  quant_config: Optional[QuantizationConfig] = None,
                  rope_scaling: Optional[Tuple] = None,
-                 prefix: str = "") -> None:
+                 prefix: str = "",
+                 attn_type: str = AttentionType.DECODER) -> None:
         super().__init__()
         self.hidden_size = hidden_size
         tp_size = get_tensor_model_parallel_world_size()
@@ -160,7 +161,8 @@ def __init__(self,
                               num_kv_heads=self.num_kv_heads,
                               cache_config=cache_config,
                               quant_config=quant_config,
-                              prefix=f"{prefix}.attn")
+                              prefix=f"{prefix}.attn",
+                              attn_type=attn_type)
 
     def forward(
         self,
@@ -168,17 +170,11 @@ def forward(
         hidden_states: torch.Tensor,
         kv_cache: torch.Tensor,
         attn_metadata: AttentionMetadata,
-        attn_type: str = AttentionType.DECODER,
     ) -> torch.Tensor:
         qkv, _ = self.qkv_proj(hidden_states)
         q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
         q, k = self.rotary_emb(positions, q, k)
-        attn_output = self.attn(q,
-                                k,
-                                v,
-                                kv_cache,
-                                attn_metadata,
-                                attn_type=attn_type)
+        attn_output = self.attn(q, k, v, kv_cache, attn_metadata)
         output, _ = self.o_proj(attn_output)
         return output
 
@@ -197,6 +193,16 @@ def __init__(
         # Requires transformers > 4.32.0
         rope_theta = getattr(config, "rope_theta", 1000000)
         rope_scaling = getattr(config, "rope_scaling", None)
+
+        # By default, Qwen2 uses causal attention as it is a decoder-only model.
+        # You can override the HF config with `is_causal=False` to enable
+        # bidirectional attention, which is used in some embedding models
+        # (e.g. Alibaba-NLP/gte-Qwen2-7B-instruct)
+        if getattr(config, "is_causal", True):
+            attn_type = AttentionType.DECODER
+        else:
+            attn_type = AttentionType.ENCODER_ONLY
+
         self.self_attn = Qwen2Attention(
             hidden_size=self.hidden_size,
             num_heads=config.num_attention_heads,
@@ -207,6 +213,7 @@ def __init__(
             quant_config=quant_config,
             rope_scaling=rope_scaling,
             prefix=f"{prefix}.self_attn",
+            attn_type=attn_type,
         )
         self.mlp = Qwen2MLP(
             hidden_size=self.hidden_size,
@@ -220,15 +227,6 @@ def __init__(
         self.post_attention_layernorm = RMSNorm(config.hidden_size,
                                                 eps=config.rms_norm_eps)
 
-        # By default, Qwen2 uses causal attention as it is a decoder-only model.
-        # You can override the HF config with `is_causal=False` to enable
-        # bidirectional attention, which is used in some embedding models
-        # (e.g. Alibaba-NLP/gte-Qwen2-7B-instruct)
-        if getattr(config, "is_causal", True):
-            self._attn_type = AttentionType.DECODER
-        else:
-            self._attn_type = AttentionType.ENCODER_ONLY
-
     def forward(
         self,
         positions: torch.Tensor,
@@ -249,7 +247,6 @@ def forward(
             hidden_states=hidden_states,
             kv_cache=kv_cache,
             attn_metadata=attn_metadata,
-            attn_type=self._attn_type,
         )
 
         # Fully Connected
diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py
index 65002f1ad70c7..b02bc9ffde538 100644
--- a/vllm/v1/attention/backends/flash_attn.py
+++ b/vllm/v1/attention/backends/flash_attn.py
@@ -89,6 +89,7 @@ def __init__(
         kv_cache_dtype: str,
         blocksparse_params: Optional[Dict[str, Any]] = None,
         logits_soft_cap: Optional[float] = None,
+        attn_type: AttentionType = AttentionType.DECODER,
     ) -> None:
         if blocksparse_params is not None:
             raise ValueError(
@@ -119,6 +120,12 @@ def __init__(
                 f"Head size {head_size} is not supported by FlashAttention. "
                 f"Supported head sizes are: {support_head_sizes}.")
 
+        if attn_type != AttentionType.DECODER:
+            raise NotImplementedError("Encoder self-attention and "
+                                      "encoder/decoder cross-attention "
+                                      "are not implemented for "
+                                      "FlashAttentionImpl")
+
     def forward(
         self,
         query: torch.Tensor,
@@ -128,7 +135,6 @@ def forward(
         attn_metadata: FlashAttentionMetadata,
         k_scale: float = 1.0,
         v_scale: float = 1.0,
-        attn_type: AttentionType = AttentionType.DECODER,
         output: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         """Forward pass with FlashAttention.
@@ -142,12 +148,6 @@ def forward(
         Returns:
             shape = [num_tokens, num_heads * head_size]
         """
-        if attn_type != AttentionType.DECODER:
-            raise NotImplementedError("Encoder self-attention and "
-                                      "encoder/decoder cross-attention "
-                                      "are not implemented for "
-                                      "FlashAttentionImpl")
-
         # NOTE(woosuk): FlashAttention does not support FP8 KV cache.
         assert k_scale == 1.0 and v_scale == 1.0, (
             "key/v_scale is not supported in FlashAttention.")

From 91b361ae898c944f823534121613f9d3dc19d7d1 Mon Sep 17 00:00:00 2001
From: Roger Wang <136131678+ywang96@users.noreply.github.com>
Date: Mon, 6 Jan 2025 11:58:16 -0800
Subject: [PATCH 003/115] [V1] Extend beyond image modality and support
 mixed-modality inference with Llava-OneVision (#11685)

Signed-off-by: Roger Wang <ywang@roblox.com>
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 docs/source/models/supported_models.md        |   2 +-
 tests/multimodal/test_utils.py                | 209 +++++++++++++++++-
 tests/v1/core/test_kv_cache_utils.py          |  18 +-
 tests/v1/core/test_prefix_caching.py          |  17 +-
 vllm/model_executor/models/interfaces.py      |   6 +-
 vllm/model_executor/models/llava_onevision.py |  65 +++---
 vllm/model_executor/models/molmo.py           |   3 -
 vllm/multimodal/__init__.py                   |   3 +
 vllm/multimodal/hasher.py                     | 100 +++++++++
 vllm/multimodal/inputs.py                     |   9 +-
 vllm/multimodal/processing.py                 |  92 +++-----
 vllm/multimodal/utils.py                      |  86 ++++++-
 vllm/v1/engine/__init__.py                    |  18 +-
 vllm/v1/engine/mm_input_mapper.py             |  67 ------
 vllm/v1/engine/processor.py                   | 101 ++++++---
 vllm/v1/request.py                            |  48 ++--
 vllm/v1/worker/gpu_model_runner.py            |  74 ++++---
 17 files changed, 636 insertions(+), 282 deletions(-)
 create mode 100644 vllm/multimodal/hasher.py

diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md
index 5a2778026192a..94a8849f7edcd 100644
--- a/docs/source/models/supported_models.md
+++ b/docs/source/models/supported_models.md
@@ -647,7 +647,7 @@ See [this page](#generative-models) for more information on how to use generativ
   - `llava-hf/llava-onevision-qwen2-7b-ov-hf`, `llava-hf/llava-onevision-qwen2-0.5b-ov-hf`, etc.
   -
   - ✅︎
-  -
+  - ✅︎
 * - `MiniCPMV`
   - MiniCPM-V
   - T + I<sup>E+</sup>
diff --git a/tests/multimodal/test_utils.py b/tests/multimodal/test_utils.py
index 6029f2e514772..198344e5bd88c 100644
--- a/tests/multimodal/test_utils.py
+++ b/tests/multimodal/test_utils.py
@@ -2,16 +2,22 @@
 import mimetypes
 import os
 from tempfile import NamedTemporaryFile, TemporaryDirectory
-from typing import Dict, Tuple
+from typing import TYPE_CHECKING, Dict, NamedTuple, Optional, Tuple
 
 import numpy as np
 import pytest
 from PIL import Image, ImageChops
 from transformers import AutoConfig, AutoTokenizer
 
+from vllm.multimodal.inputs import PlaceholderRange
 from vllm.multimodal.utils import (MediaConnector,
+                                   merge_and_sort_multimodal_metadata,
                                    repeat_and_pad_placeholder_tokens)
 
+if TYPE_CHECKING:
+    from vllm.multimodal.hasher import MultiModalHashDict
+    from vllm.multimodal.inputs import MultiModalPlaceholderDict
+
 # Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA)
 TEST_IMAGE_URLS = [
     "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
@@ -191,3 +197,204 @@ def test_repeat_and_pad_placeholder_tokens(model):
         assert new_prompt == expected_prompt
         assert new_token_ids == expected_token_ids
         assert ranges == expected_ranges
+
+
+# Used for the next two tests related to `merge_and_sort_multimodal_metadata`.
+class TestCase(NamedTuple):
+    mm_positions: "MultiModalPlaceholderDict"
+    mm_hashes: Optional["MultiModalHashDict"]
+    expected_modalities: list[str]
+    expected_ranges: list[PlaceholderRange]
+    expected_hashes: Optional[list[str]]
+
+
+def test_merge_and_sort_multimodal_metadata():
+
+    test_cases = [
+        # Single modality should return result as is but flattened
+        TestCase(
+            mm_positions={
+                "image": [
+                    PlaceholderRange(offset=0, length=2),
+                    PlaceholderRange(offset=3, length=2),
+                ]
+            },
+            mm_hashes={"image": ["hash1", "hash2"]},
+            expected_modalities=["image"],
+            expected_ranges=[
+                PlaceholderRange(offset=0, length=2),
+                PlaceholderRange(offset=3, length=2),
+            ],
+            expected_hashes=["hash1", "hash2"],
+        ),
+
+        # Single modality without hashes returns None for mm hash.
+        TestCase(
+            mm_positions={
+                "image": [
+                    PlaceholderRange(offset=0, length=2),
+                    PlaceholderRange(offset=2, length=2),
+                ]
+            },
+            mm_hashes=None,
+            expected_modalities=["image"],
+            expected_ranges=[
+                PlaceholderRange(offset=0, length=2),
+                PlaceholderRange(offset=2, length=2),
+            ],
+            expected_hashes=None,
+        ),
+
+        # Multiple modalities with hashes should return sorted modalities
+        # and flattened ranges and hashes.
+        TestCase(
+            mm_positions={
+                "image": [
+                    PlaceholderRange(offset=7, length=4),
+                    PlaceholderRange(offset=11, length=5),
+                ],
+                "audio": [
+                    PlaceholderRange(offset=0, length=2),
+                    PlaceholderRange(offset=2, length=3),
+                ]
+            },
+            mm_hashes={
+                "image": ["image_hash1", "image_hash2"],
+                "audio": ["audio_hash1", "audio_hash2"],
+            },
+            expected_modalities=["audio", "image"],
+            expected_ranges=[
+                PlaceholderRange(offset=0, length=2),
+                PlaceholderRange(offset=2, length=3),
+                PlaceholderRange(offset=7, length=4),
+                PlaceholderRange(offset=11, length=5),
+            ],
+            expected_hashes=[
+                "audio_hash1", "audio_hash2", "image_hash1", "image_hash2"
+            ],
+        ),
+
+        # Multiple modalities without hashes should return sorted modalities
+        # and flattened ranges and None.
+        TestCase(
+            mm_positions={
+                "image": [
+                    PlaceholderRange(offset=7, length=4),
+                    PlaceholderRange(offset=11, length=5),
+                ],
+                "audio": [
+                    PlaceholderRange(offset=0, length=2),
+                    PlaceholderRange(offset=2, length=3),
+                ]
+            },
+            mm_hashes=None,
+            expected_modalities=["audio", "image"],
+            expected_ranges=[
+                PlaceholderRange(offset=0, length=2),
+                PlaceholderRange(offset=2, length=3),
+                PlaceholderRange(offset=7, length=4),
+                PlaceholderRange(offset=11, length=5),
+            ],
+            expected_hashes=None,
+        ),
+
+        # Three modalities
+        TestCase(
+            mm_positions={
+                "image": [
+                    PlaceholderRange(offset=15, length=7),
+                    PlaceholderRange(offset=22, length=8),
+                ],
+                "audio": [
+                    PlaceholderRange(offset=0, length=2),
+                ],
+                "video": [
+                    PlaceholderRange(offset=3, length=4),
+                    PlaceholderRange(offset=7, length=5),
+                    PlaceholderRange(offset=12, length=6),
+                ]
+            },
+            mm_hashes={
+                "image": ["image_hash1", "image_hash2"],
+                "audio": ["audio_hash1"],
+                "video": ["video_hash1", "video_hash2", "video_hash3"]
+            },
+            expected_modalities=["audio", "video", "image"],
+            expected_ranges=[
+                PlaceholderRange(offset=0, length=2),
+                PlaceholderRange(offset=3, length=4),
+                PlaceholderRange(offset=7, length=5),
+                PlaceholderRange(offset=12, length=6),
+                PlaceholderRange(offset=15, length=7),
+                PlaceholderRange(offset=22, length=8),
+            ],
+            expected_hashes=[
+                "audio_hash1", "video_hash1", "video_hash2", "video_hash3",
+                "image_hash1", "image_hash2"
+            ],
+        ),
+    ]
+
+    for (mm_positions, mm_hashes, expected_modalities, expected_ranges,
+         expected_hashes) in test_cases:
+        modalities, ranges, hashes = merge_and_sort_multimodal_metadata(
+            mm_positions, mm_hashes)
+
+        assert modalities == expected_modalities
+        assert ranges == expected_ranges
+        assert hashes == expected_hashes
+
+
+def test_merge_and_sort_multimodal_metadata_with_interleaving():
+
+    test_cases = [
+
+        # <image> <audio> <image> <audio>
+        TestCase(
+            mm_positions={
+                "image": [
+                    PlaceholderRange(offset=0, length=4),
+                    PlaceholderRange(offset=8, length=2),
+                ],
+                "audio": [
+                    PlaceholderRange(offset=5, length=2),
+                    PlaceholderRange(offset=11, length=4),
+                ]
+            },
+            mm_hashes={
+                "image": ["image_hash1", "image_hash2"],
+                "audio": ["audio_hash1", "audio_hash2"],
+            },
+            expected_modalities=[],
+            expected_ranges=[],
+            expected_hashes=None,
+        ),
+
+        # <image> <image> <video> <audio> <image>
+        TestCase(
+            mm_positions={
+                "image": [
+                    PlaceholderRange(offset=0, length=2),
+                    PlaceholderRange(offset=2, length=3),
+                    PlaceholderRange(offset=20, length=4),
+                ],
+                "audio": [
+                    PlaceholderRange(offset=5, length=2),
+                ],
+                "video": [
+                    PlaceholderRange(offset=8, length=5),
+                ]
+            },
+            mm_hashes=None,
+            expected_modalities=[],
+            expected_ranges=[],
+            expected_hashes=None,
+        ),
+    ]
+
+    for case in test_cases:
+        with pytest.raises(ValueError) as ex_info:
+            merge_and_sort_multimodal_metadata(case.mm_positions,
+                                               case.mm_hashes)
+
+        assert "Interleaved mixed-modality" in str(ex_info.value)
diff --git a/tests/v1/core/test_kv_cache_utils.py b/tests/v1/core/test_kv_cache_utils.py
index 2ed70b42991b5..f4081766e39a2 100644
--- a/tests/v1/core/test_kv_cache_utils.py
+++ b/tests/v1/core/test_kv_cache_utils.py
@@ -1,6 +1,6 @@
 import pytest
 
-from vllm.inputs import token_inputs
+from vllm.multimodal.inputs import MultiModalKwargs
 from vllm.sampling_params import SamplingParams
 from vllm.v1.core.kv_cache_utils import (BlockHashType, FreeKVCacheBlockQueue,
                                          KVCacheBlock,
@@ -14,14 +14,18 @@ def make_request(request_id,
                  prompt_token_ids,
                  mm_positions=None,
                  mm_hashes=None):
+    if mm_positions is None:
+        multi_modal_inputs = None
+    else:
+        multi_modal_inputs = [MultiModalKwargs({})] * len(mm_positions)
+
     return Request(
         request_id=request_id,
-        inputs=token_inputs(
-            prompt_token_ids=prompt_token_ids,
-            multi_modal_placeholders={"image": mm_positions}
-            if mm_positions else None,
-            multi_modal_hashes=mm_hashes,
-        ),
+        prompt=None,
+        prompt_token_ids=prompt_token_ids,
+        multi_modal_inputs=multi_modal_inputs,
+        multi_modal_hashes=mm_hashes,
+        multi_modal_placeholders=mm_positions,
         sampling_params=SamplingParams(max_tokens=17),
         eos_token_id=100,
         arrival_time=0,
diff --git a/tests/v1/core/test_prefix_caching.py b/tests/v1/core/test_prefix_caching.py
index 35e3a2f972720..b97f55b8c6535 100644
--- a/tests/v1/core/test_prefix_caching.py
+++ b/tests/v1/core/test_prefix_caching.py
@@ -1,8 +1,7 @@
 """Compare the with and without prefix caching."""
 import pytest
 
-from vllm.inputs import token_inputs
-from vllm.multimodal.inputs import PlaceholderRange
+from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange
 from vllm.sampling_params import SamplingParams
 from vllm.utils import cdiv
 from vllm.v1.core.kv_cache_manager import KVCacheManager, Request
@@ -13,12 +12,18 @@ def make_request(request_id,
                  prompt_token_ids,
                  mm_positions=None,
                  mm_hashes=None):
+    if mm_positions is None:
+        multi_modal_inputs = None
+    else:
+        multi_modal_inputs = [MultiModalKwargs({})] * len(mm_positions)
+
     return Request(
         request_id=request_id,
-        inputs=token_inputs(prompt_token_ids=prompt_token_ids,
-                            multi_modal_placeholders={"image": mm_positions}
-                            if mm_positions else None,
-                            multi_modal_hashes=mm_hashes),
+        prompt=None,
+        prompt_token_ids=prompt_token_ids,
+        multi_modal_inputs=multi_modal_inputs,
+        multi_modal_hashes=mm_hashes,
+        multi_modal_placeholders=mm_positions,
         sampling_params=SamplingParams(max_tokens=17),
         eos_token_id=100,
         arrival_time=0,
diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py
index 70b78fe64f2d8..6f26603046483 100644
--- a/vllm/model_executor/models/interfaces.py
+++ b/vllm/model_executor/models/interfaces.py
@@ -39,8 +39,12 @@ def get_multimodal_embeddings(self, **kwargs) -> Optional[T]:
 
         The output embeddings must be one of the following formats:
         - A list or tuple of 2D tensors, where each tensor corresponds to 
-          each input image.
+          each input multimodal data item (e.g., image).
         - A single 3D tensor, with the batch dimension grouping the 2D tensors.
+
+        NOTE: The returned multimodal embeddings must be in the same order as 
+        the appearances of their corresponding multimodal data item in the 
+        input prompt.
         """
         ...
 
diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py
index 5eac2f223d794..9117824995021 100644
--- a/vllm/model_executor/models/llava_onevision.py
+++ b/vllm/model_executor/models/llava_onevision.py
@@ -35,6 +35,9 @@
 from .utils import (AutoWeightsLoader, flatten_bn, init_vllm_registered_model,
                     maybe_prefix, merge_multimodal_embeddings)
 
+# For profile run
+_MAX_FRAMES_PER_VIDEO = 16
+
 
 class LlavaOnevisionVideoPixelInputs(TypedDict):
     type: Literal["pixel_values_videos"]
@@ -223,8 +226,10 @@ def _get_dummy_num_frames(self, seq_len: int) -> int:
         max_image_tokens = self._get_max_image_tokens() * max_images
         max_total_frames = self._get_max_video_frames(seq_len -
                                                       max_image_tokens)
+        max_frames_per_video = min(max_total_frames // max(max_videos, 1),
+                                   _MAX_FRAMES_PER_VIDEO)
 
-        return max(max_total_frames // max(max_videos, 1), 1)
+        return max(max_frames_per_video, 1)
 
     def _get_max_video_tokens(self, seq_len: int) -> int:
         target_width, target_height = self._get_image_size_with_most_features()
@@ -558,13 +563,15 @@ def _parse_and_validate_video_input(
     def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict:
         modalities = {}
 
-        if "pixel_values" in kwargs:
-            modalities["images"] = self._parse_and_validate_image_input(
-                **kwargs)
-
-        if "pixel_values_videos" in kwargs:
-            modalities["videos"] = self._parse_and_validate_video_input(
-                **kwargs)
+        # Preserve the order of modalities if there are multiple of them
+        # from the order of kwargs.
+        for input_key in kwargs:
+            if input_key == "pixel_values" and "images" not in modalities:
+                modalities["images"] = self._parse_and_validate_image_input(
+                    **kwargs)
+            if input_key == "pixel_values_videos" and "videos" not in modalities:  # noqa: E501
+                modalities["videos"] = self._parse_and_validate_video_input(
+                    **kwargs)
 
         return modalities
 
@@ -824,21 +831,21 @@ def get_multimodal_embeddings(
         if not modalities:
             return None
 
-        # We make a tuple of each embedding with its modality string. This is a
-        # temporary workaround for models to handle mixed modalities when
-        # get_multimodal_embeddings and get_input_embeddings are called
-        # separately.
-        # TODO(ywang96): Add support for mixed-modality inference for v1.
-        multimodal_embeddings: List[Tuple[NestedTensors, str]] = []
-
-        if "images" in modalities:
-            image_input = modalities["images"]
-            vision_embeddings = self._process_image_input(image_input)
-            multimodal_embeddings.append((vision_embeddings, "image"))
-        if "videos" in modalities:
-            video_input = modalities["videos"]
-            video_embeddings = self._process_video_pixels(video_input)
-            multimodal_embeddings.append((video_embeddings, "video"))
+        # The result multimodal_embeddings is a tuple of tensors, with each
+        # tensor corresponding to a multimodal data item (image or video).
+        multimodal_embeddings: tuple[torch.Tensor, ...] = ()
+
+        # NOTE: It is important to iterate over the keys in this dictionary
+        # to preserve the order of the modalities.
+        for modality in modalities:
+            if modality == "images":
+                image_input = modalities["images"]
+                vision_embeddings = self._process_image_input(image_input)
+                multimodal_embeddings += tuple(vision_embeddings)
+            if modality == "videos":
+                video_input = modalities["videos"]
+                video_embeddings = self._process_video_pixels(video_input)
+                multimodal_embeddings += tuple(video_embeddings)
 
         return multimodal_embeddings
 
@@ -850,15 +857,9 @@ def get_input_embeddings(
     ) -> torch.Tensor:
         inputs_embeds = self.language_model.get_input_embeddings(input_ids)
         if multimodal_embeddings is not None:
-            for embeddings, modality in multimodal_embeddings:
-                if modality == "image":
-                    inputs_embeds = merge_multimodal_embeddings(
-                        input_ids, inputs_embeds, embeddings,
-                        self.config.image_token_index)
-                if modality == "video":
-                    inputs_embeds = merge_multimodal_embeddings(
-                        input_ids, inputs_embeds, embeddings,
-                        self.config.video_token_index)
+            inputs_embeds = merge_multimodal_embeddings(
+                input_ids, inputs_embeds, multimodal_embeddings,
+                [self.config.image_token_index, self.config.video_token_index])
         return inputs_embeds
 
     def forward(
diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py
index 0e8287bb56b6b..2e60bc719f096 100644
--- a/vllm/model_executor/models/molmo.py
+++ b/vllm/model_executor/models/molmo.py
@@ -972,8 +972,6 @@ def image_input_mapper_for_molmo(
         assert len(data) == 1, "Molmo supports only one image per prompt."
         data = data[0]
 
-    # Remove unused dummy PIL image
-    data.pop('raw_mm_data', None)
     return MultiModalKwargs(data)
 
 
@@ -1019,7 +1017,6 @@ def dummy_data_for_molmo(ctx: InputContext, seq_len: int,
     dummy_imgdata = {
         "images": out["images"],
         "image_input_idx": out["image_input_idx"],
-        "raw_mm_data": dummy_image,
     }
     if "image_masks" in out:
         dummy_imgdata["image_masks"] = out["image_masks"]
diff --git a/vllm/multimodal/__init__.py b/vllm/multimodal/__init__.py
index e58bbe81717a0..343b9322ecc5e 100644
--- a/vllm/multimodal/__init__.py
+++ b/vllm/multimodal/__init__.py
@@ -1,4 +1,5 @@
 from .base import MultiModalPlaceholderMap, MultiModalPlugin
+from .hasher import MultiModalHashDict, MultiModalHasher
 from .inputs import (BatchedTensorInputs, ModalityData, MultiModalDataBuiltins,
                      MultiModalDataDict, MultiModalKwargs,
                      MultiModalPlaceholderDict, NestedTensors)
@@ -18,6 +19,8 @@
     "ModalityData",
     "MultiModalDataBuiltins",
     "MultiModalDataDict",
+    "MultiModalHashDict",
+    "MultiModalHasher",
     "MultiModalKwargs",
     "MultiModalPlaceholderDict",
     "MultiModalPlaceholderMap",
diff --git a/vllm/multimodal/hasher.py b/vllm/multimodal/hasher.py
new file mode 100644
index 0000000000000..24aa1ca658048
--- /dev/null
+++ b/vllm/multimodal/hasher.py
@@ -0,0 +1,100 @@
+import pickle
+from typing import TYPE_CHECKING, Iterable, Mapping, Optional
+
+import numpy as np
+import torch
+from blake3 import blake3
+from PIL import Image
+
+from vllm.logger import init_logger
+
+if TYPE_CHECKING:
+    from vllm.inputs import TokensPrompt
+
+logger = init_logger(__name__)
+
+MultiModalHashDict = Mapping[str, list[str]]
+"""
+A dictionary containing hashes for items in each modality.
+"""
+
+
+class MultiModalHasher:
+
+    @classmethod
+    def serialize_item(cls, obj: object) -> bytes:
+        # Simple cases
+        if isinstance(obj, str):
+            return obj.encode("utf-8")
+        if isinstance(obj, bytes):
+            return obj
+        if isinstance(obj, Image.Image):
+            return obj.tobytes()
+
+        # Convertible to NumPy arrays
+        if isinstance(obj, torch.Tensor):
+            obj = obj.numpy()
+        if isinstance(obj, (int, float)):
+            obj = np.array(obj)
+        if isinstance(obj, np.ndarray):
+            return obj.tobytes()
+
+        logger.warning(
+            "No serialization method found for %s. "
+            "Falling back to pickle.", type(obj))
+
+        return pickle.dumps(obj)
+
+    @classmethod
+    def item_to_bytes(
+        cls,
+        key: str,
+        obj: object,
+    ) -> Iterable[tuple[bytes, bytes]]:
+        # Recursive cases
+        if isinstance(obj, (list, tuple)):
+            for i, elem in enumerate(obj):
+                yield from cls.item_to_bytes(f"{key}.{i}", elem)
+        elif isinstance(obj, dict):
+            for k, v in obj.items():
+                yield from cls.item_to_bytes(f"{key}.{k}", v)
+        else:
+            key_bytes = cls.serialize_item(key)
+            value_bytes = cls.serialize_item(obj)
+            yield key_bytes, value_bytes
+
+    @classmethod
+    def hash_kwargs(cls, **kwargs: object) -> str:
+        hasher = blake3()
+
+        for k, v in kwargs.items():
+            for k_bytes, v_bytes in cls.item_to_bytes(k, v):
+                hasher.update(k_bytes)
+                hasher.update(v_bytes)
+
+        return hasher.hexdigest()
+
+    @classmethod
+    def hash_prompt_mm_data(
+            cls, prompt: "TokensPrompt") -> Optional["MultiModalHashDict"]:
+        """Hash multimodal data in the user input prompt if they exist."""
+
+        if "multi_modal_data" not in prompt:
+            return None
+
+        mm_data = prompt["multi_modal_data"]
+        if not mm_data:
+            # mm_data can be None or an empty dict.
+            return None
+
+        mm_items = {
+            modality: items if isinstance(items, list) else [items]
+            for modality, items in mm_data.items()
+        }
+
+        mm_hashes = {
+            modality: [cls.hash_kwargs(**{modality: item}) for item in items]
+            for modality, items in mm_items.items()
+        }
+
+        return mm_hashes
diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py
index b0a1104546186..8fdcc4b524035 100644
--- a/vllm/multimodal/inputs.py
+++ b/vllm/multimodal/inputs.py
@@ -2,8 +2,8 @@
 from collections import UserDict, defaultdict
 from collections.abc import Mapping, Sequence
 from dataclasses import dataclass
-from typing import (Any, Literal, Optional, TypedDict, TypeVar, Union, cast,
-                    final)
+from typing import (TYPE_CHECKING, Any, Literal, Optional, TypedDict, TypeVar,
+                    Union, cast, final)
 
 import numpy as np
 import torch
@@ -14,6 +14,9 @@
 
 from vllm.utils import JSONTree, full_groupby, is_list_of, json_map_leaves
 
+if TYPE_CHECKING:
+    from .hasher import MultiModalHashDict
+
 _T = TypeVar("_T")
 
 HfImageItem: TypeAlias = Union[Image, np.ndarray, torch.Tensor]
@@ -513,7 +516,7 @@ class MultiModalInputsV2(TypedDict):
     mm_kwargs: MultiModalKwargs
     """Keyword arguments to be directly passed to the model after batching."""
 
-    mm_hashes: NotRequired[list[str]]
+    mm_hashes: NotRequired[Optional["MultiModalHashDict"]]
     """The hashes of the multi-modal data."""
 
     mm_placeholders: MultiModalPlaceholderDict
diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py
index 933c1d3aff0cb..41113cd85bd16 100644
--- a/vllm/multimodal/processing.py
+++ b/vllm/multimodal/processing.py
@@ -1,4 +1,3 @@
-import pickle
 import re
 from abc import ABC, abstractmethod
 from collections import defaultdict
@@ -7,18 +6,16 @@
 from functools import lru_cache
 from typing import Any, NamedTuple, Optional, Protocol, TypeVar, Union
 
-import numpy as np
-import torch
-from blake3 import blake3
-from PIL import Image
 from transformers import BatchFeature, PretrainedConfig, ProcessorMixin
 
+from vllm import envs
 from vllm.inputs import DummyData, InputProcessingContext
 from vllm.logger import init_logger
 from vllm.transformers_utils.tokenizer import (AnyTokenizer, decode_tokens,
                                                encode_tokens)
 from vllm.utils import LRUCache, flatten_2d_lists, full_groupby
 
+from .hasher import MultiModalHasher
 from .inputs import (MultiModalDataDict, MultiModalFieldConfig,
                      MultiModalInputsV2, MultiModalKwargs,
                      MultiModalKwargsItem, PlaceholderRange)
@@ -486,56 +483,6 @@ def _maybe_log_cache_stats(self) -> None:
             logger.debug("ProcessingCache: hit_ratio = %.2f",
                          cache_stats.hit_ratio)
 
-    def _serialize_item(self, obj: object) -> bytes:
-        # Simple cases
-        if isinstance(obj, str):
-            return obj.encode("utf-8")
-        if isinstance(obj, bytes):
-            return obj
-        if isinstance(obj, Image.Image):
-            return obj.tobytes()
-
-        # Convertible to NumPy arrays
-        if isinstance(obj, torch.Tensor):
-            obj = obj.numpy()
-        if isinstance(obj, (int, float)):
-            obj = np.array(obj)
-        if isinstance(obj, np.ndarray):
-            return obj.tobytes()
-
-        logger.warning(
-            "No serialization method found for %s. "
-            "Falling back to pickle.", type(obj))
-
-        return pickle.dumps(obj)
-
-    def _item_to_bytes(
-        self,
-        key: str,
-        obj: object,
-    ) -> Iterable[tuple[bytes, bytes]]:
-        # Recursive cases
-        if isinstance(obj, (list, tuple)):
-            for i, elem in enumerate(obj):
-                yield from self._item_to_bytes(f"{key}.{i}", elem)
-        elif isinstance(obj, dict):
-            for k, v in obj.items():
-                yield from self._item_to_bytes(f"{key}.{k}", v)
-        else:
-            key_bytes = self._serialize_item(key)
-            value_bytes = self._serialize_item(obj)
-            yield key_bytes, value_bytes
-
-    def _hash_kwargs(self, **kwargs: object) -> str:
-        hasher = blake3()
-
-        for k, v in kwargs.items():
-            for k_bytes, v_bytes in self._item_to_bytes(k, v):
-                hasher.update(k_bytes)
-                hasher.update(v_bytes)
-
-        return hasher.hexdigest()
-
     def get(
         self,
         model_id: str,
@@ -554,9 +501,9 @@ def get(
         """
         self._maybe_log_cache_stats()
 
-        cache_key = self._hash_kwargs(model_id=model_id,
-                                      **{modality: input_item},
-                                      **input_kwargs)
+        cache_key = MultiModalHasher.hash_kwargs(model_id=model_id,
+                                                 **{modality: input_item},
+                                                 **input_kwargs)
         return self._cache.get(cache_key)
 
     def put(
@@ -571,9 +518,9 @@ def put(
         Put a processed multi-modal item into the cache
         according to its dependencies (see :meth:`get`).
         """
-        cache_key = self._hash_kwargs(model_id=model_id,
-                                      **{modality: input_item},
-                                      **input_kwargs)
+        cache_key = MultiModalHasher.hash_kwargs(model_id=model_id,
+                                                 **{modality: input_item},
+                                                 **input_kwargs)
         self._cache.put(cache_key, output_kwargs)
 
 
@@ -1049,6 +996,24 @@ def apply(
         """
         mm_items = self._to_mm_items(mm_data)
 
+        # Create MM hashes (only used in V1)
+        # TODO: Use these hash keys for caching operations in apply_hf_processor
+        # instead of rehashing.
+
+        if envs.VLLM_USE_V1:
+            model_id = self.ctx.model_config.model
+            mm_hashes = {
+                modality: [
+                    MultiModalHasher.hash_kwargs(model_id=model_id,
+                                                 **{modality: item},
+                                                 **hf_processor_mm_kwargs)
+                    for item in items
+                ]
+                for modality, items in mm_items.items()
+            }
+        else:
+            mm_hashes = None
+
         prompt_ids, mm_kwargs = self._cached_apply_hf_processor(
             prompt_text,
             mm_items,
@@ -1122,6 +1087,7 @@ def apply(
             prompt=prompt_text,
             prompt_token_ids=prompt_ids,
             mm_kwargs=mm_kwargs,
+            mm_hashes=mm_hashes,
             mm_placeholders=mm_placeholder_ranges,
         )
 
@@ -1174,7 +1140,9 @@ def get_dummy_data(self, seq_len: int) -> DummyData:
                 "tokens.")
 
         total_len = len(prompt_token_ids)
-        if total_len > seq_len:
+
+        # V0 does not support chunked prefill.
+        if total_len > seq_len and not envs.VLLM_USE_V1:
             logger.warning(
                 "The context length (%d) of the model is too short "
                 "to hold the multi-modal embeddings in the worst case "
diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py
index 7b6ded6a27084..f4a514ba55d0c 100644
--- a/vllm/multimodal/utils.py
+++ b/vllm/multimodal/utils.py
@@ -1,6 +1,6 @@
 from functools import lru_cache
 from pathlib import Path
-from typing import Optional, TypeVar, Union
+from typing import TYPE_CHECKING, Optional, TypeVar, Union
 from urllib.parse import ParseResult, urlparse
 
 import numpy as np
@@ -25,6 +25,10 @@
 
 _M = TypeVar("_M")
 
+if TYPE_CHECKING:
+    from .hasher import MultiModalHashDict
+    from .inputs import MultiModalPlaceholderDict
+
 
 class MediaConnector:
 
@@ -437,3 +441,83 @@ def consecutive_placeholder_ranges(
         PlaceholderRange(offset=initial_offset + i * item_size,
                          length=item_size) for i in range(num_items)
     ]
+
+
+def merge_and_sort_multimodal_metadata(
+    mm_positions: "MultiModalPlaceholderDict",
+    mm_hashes: Optional["MultiModalHashDict"],
+) -> tuple[list[str], list[PlaceholderRange], Optional[list[str]]]:
+    """Given a MultiModalPlaceholderDict, merge all PlaceholderRange
+    objects from all available modalities into a single list of 
+    PlaceholderRange, sorted by their offset (starting index in the input 
+    sequence) in the ascending order.
+
+    Optionally, if a MultiModalHashDict is given, the same operation will be
+    applied to it and the sorted list of hashes will be returned.
+
+    Raises:
+        ValueError: If the input prompt has interleaved placeholders from
+            different modalities (e.g., "<image><audio><image> Describe the
+            content.")
+    
+    Returns:
+        list[str]: Sorted list of involved modalities.
+        list[PlaceholderRange]: Sorted list of all PlaceholderRanges from
+            mm_positions.
+        Optional[list[str]]: Sorted list of all hashes from mm_hashes if 
+            given, None otherwise.
+    """
+
+    modalities = list(mm_positions.keys())
+
+    assert len(modalities) > 0, "No modalities found in the mm_positions."
+
+    # For single modality, placeholder ranges and hashes are already sorted
+    # so we can return the list directly.
+    if len(modalities) == 1:
+        if mm_hashes is None:
+            return modalities, list(mm_positions[modalities[0]]), None
+        else:
+            return modalities, list(mm_positions[modalities[0]]), list(
+                mm_hashes[modalities[0]])
+
+    placeholder_lists_with_modality = [(modality, mm_positions[modality])
+                                       for modality in modalities]
+
+    if mm_hashes is None:
+        sorted_placeholder_lists = sorted(placeholder_lists_with_modality,
+                                          key=lambda x: x[1][0]['offset'])
+        sorted_hash_lists = None
+    else:
+        hashes_lists = [
+            mm_hashes[modality] for modality in modalities
+            if modality in mm_hashes
+        ]
+        sorted_pairs = sorted(zip(placeholder_lists_with_modality,
+                                  hashes_lists),
+                              key=lambda x: x[0][1][0]['offset'])
+        sorted_placeholder_tuple, sorted_hash_tuple = zip(*sorted_pairs)
+        sorted_placeholder_lists = list(sorted_placeholder_tuple)
+        sorted_hash_lists = list(sorted_hash_tuple)
+
+    sorted_modalities = [modality for modality, _ in sorted_placeholder_lists]
+
+    # Flatten sorted list of lists to a single list and verify there is no
+    # interleaving of placeholders from different modalities.
+    merged_placeholders: list[PlaceholderRange] = []
+    for modality, placeholder_list in sorted_placeholder_lists:
+        if merged_placeholders and placeholder_list[0][
+                'offset'] < merged_placeholders[-1]['offset']:
+            raise ValueError(
+                "Interleaved mixed-modality inference is currently not "
+                "supported.")
+        merged_placeholders.extend(placeholder_list)
+
+    if sorted_hash_lists is not None:
+        merged_hashes = []
+        for hash_list in sorted_hash_lists:
+            merged_hashes.extend(hash_list)
+    else:
+        merged_hashes = None
+
+    return sorted_modalities, merged_placeholders, merged_hashes
diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py
index f70464fc88298..5e3c5e327ef63 100644
--- a/vllm/v1/engine/__init__.py
+++ b/vllm/v1/engine/__init__.py
@@ -1,12 +1,14 @@
 import enum
 from dataclasses import dataclass
-from typing import List, Optional, Union
+from typing import TYPE_CHECKING, List, Optional, Union
 
 import msgspec
 
-from vllm.lora.request import LoRARequest
-from vllm.multimodal import MultiModalKwargs, MultiModalPlaceholderDict
-from vllm.sampling_params import SamplingParams
+if TYPE_CHECKING:
+    from vllm.lora.request import LoRARequest
+    from vllm.multimodal import MultiModalKwargs
+    from vllm.multimodal.inputs import PlaceholderRange
+    from vllm.sampling_params import SamplingParams
 
 
 @dataclass
@@ -21,13 +23,13 @@ class EngineCoreRequest:
     # always be tokenized?
     prompt: Optional[str]
     prompt_token_ids: List[int]
-    mm_inputs: Optional[List[Optional[MultiModalKwargs]]]
+    mm_inputs: Optional[List[Optional["MultiModalKwargs"]]]
     mm_hashes: Optional[List[str]]
-    mm_placeholders: Optional[MultiModalPlaceholderDict]
-    sampling_params: SamplingParams
+    mm_placeholders: Optional[List["PlaceholderRange"]]
+    sampling_params: "SamplingParams"
     eos_token_id: Optional[int]
     arrival_time: float
-    lora_request: Optional[LoRARequest]
+    lora_request: Optional["LoRARequest"]
 
 
 class EngineCoreOutput(
diff --git a/vllm/v1/engine/mm_input_mapper.py b/vllm/v1/engine/mm_input_mapper.py
index 8bfc739b3dbbc..d83460a40ad26 100644
--- a/vllm/v1/engine/mm_input_mapper.py
+++ b/vllm/v1/engine/mm_input_mapper.py
@@ -1,10 +1,6 @@
 from typing import Any, Dict, List, Optional
 
-import PIL
-from blake3 import blake3
-
 from vllm.config import ModelConfig
-from vllm.inputs import PromptType
 from vllm.logger import init_logger
 from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalDataDict,
                              MultiModalKwargs, MultiModalRegistry)
@@ -144,66 +140,3 @@ def process_inputs(
             full_mm_inputs.append(mm_input)
 
         return full_mm_inputs
-
-
-class MMHasher:
-
-    def __init__(self):
-        pass
-
-    def hash_dummy_mm_data(
-            self,
-            mm_data: Optional[MultiModalDataDict]) -> Optional[List[str]]:
-        """Hash user-defined dummy multimodal data used for profiling."""
-
-        if mm_data is None:
-            return None
-
-        image_inputs = mm_data['image']
-
-        # This is a temporary workaround for models (e.g, Molmo) that
-        # process multimodal data in the input processor (therefore
-        # image_inputs is MultiModalKwargs instead of raw input format).
-        # `raw_mm_data` with the original input format is expected
-        # in this case.
-        if isinstance(image_inputs, dict):
-            assert "raw_mm_data" in image_inputs and isinstance(
-                image_inputs["raw_mm_data"], PIL.Image.Image)
-            image_inputs = image_inputs.pop("raw_mm_data")
-
-        return self.hash_images(image_inputs)
-
-    def hash_prompt_mm_data(self, prompt: PromptType) -> Optional[List[str]]:
-        """Hash multimodal data in the user input prompt if they exist."""
-
-        if "multi_modal_data" not in prompt:
-            return None
-
-        mm_data = prompt["multi_modal_data"]
-        if not mm_data:
-            # mm_data can be None or an empty dict.
-            return None
-
-        image_inputs = mm_data["image"]
-
-        return self.hash_images(image_inputs)
-
-    def hash_images(self, image_inputs) -> Optional[List[str]]:
-        """Hash PIL image objects to strings."""
-        if not isinstance(image_inputs, list):
-            image_inputs = [image_inputs]
-        assert len(image_inputs) > 0
-
-        ret = []
-        for image in image_inputs:
-            assert isinstance(image, PIL.Image.Image)
-
-            # Convert image to bytes
-            bytes = image.tobytes()
-
-            # Hash image bytes
-            hasher = blake3()
-            hasher.update(bytes)
-            ret.append(hasher.hexdigest())
-
-        return ret
diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py
index c0f6cfab4865c..43419d2ff5381 100644
--- a/vllm/v1/engine/processor.py
+++ b/vllm/v1/engine/processor.py
@@ -7,14 +7,15 @@
 from vllm.inputs.parse import is_encoder_decoder_inputs
 from vllm.inputs.preprocess import InputPreprocessor
 from vllm.lora.request import LoRARequest
-from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalKwargs,
-                             MultiModalRegistry)
+from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalHasher,
+                             MultiModalKwargs, MultiModalRegistry)
+from vllm.multimodal.utils import merge_and_sort_multimodal_metadata
 from vllm.pooling_params import PoolingParams
 from vllm.prompt_adapter.request import PromptAdapterRequest
 from vllm.sampling_params import SamplingParams
 from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup
 from vllm.v1.engine import EngineCoreRequest
-from vllm.v1.engine.mm_input_mapper import MMHasher, MMInputMapperClient
+from vllm.v1.engine.mm_input_mapper import MMInputMapperClient
 
 
 class Processor:
@@ -47,7 +48,6 @@ def __init__(
         # Multi-modal hasher (for images)
         self.use_hash = (not model_config.disable_mm_preprocessor_cache) or \
             cache_config.enable_prefix_caching
-        self.mm_hasher = MMHasher()
 
     def process_inputs(
         self,
@@ -73,11 +73,6 @@ def process_inputs(
         assert priority == 0, "vLLM V1 does not support priority at the moment."
         assert trace_headers is None, "vLLM V1 does not support tracing yet."
 
-        # Compute MM hashes (if enabled)
-        mm_hashes = None
-        if self.use_hash:
-            mm_hashes = self.mm_hasher.hash_prompt_mm_data(prompt)
-
         # Process inputs.
         preprocessed_inputs = self.input_preprocessor.preprocess(
             prompt,
@@ -108,8 +103,20 @@ def process_inputs(
         sampling_params.update_from_generation_config(
             self.generation_config_fields, eos_token_id)
 
+        # Multimodal related.
+        # Compute MM hashes (if enabled)
+        mm_hashes = None
+        if self.use_hash:
+            # Use mm_hashes from processed inputs if the model has merged
+            # input processor.
+            if decoder_inputs.multi_modal_hashes:
+                mm_hashes = decoder_inputs.multi_modal_hashes
+            # Fallback to using MultiModalHasher directly.
+            else:
+                mm_hashes = MultiModalHasher.hash_prompt_mm_data(prompt)
+
         # For merged preprocessor, mm_data is already mm_inputs
-        precomputed_mm_inputs = None
+        precomputed_mm_inputs: Optional[list[MultiModalKwargs]] = None
         decoder_mm_data = decoder_inputs.multi_modal_data
         if isinstance(decoder_mm_data, MultiModalKwargs):
             # The output of merged multi-modal processor (`decoder_mm_data`)
@@ -122,27 +129,67 @@ def process_inputs(
                 for item in decoder_mm_data.get_items(modality)
             ]
 
-        # Apply MM mapper
-        mm_inputs = None
-        if len(decoder_mm_data) > 0:
-            mm_inputs = self.mm_input_mapper_client.process_inputs(
-                decoder_mm_data,
+        mm_positions = decoder_inputs.multi_modal_placeholders
+
+        # Last-mile processing of multimodal metadata and inputs.
+        if mm_positions:
+
+            # Merge and flatten multimodal placeholders, hashes and inputs
+            # from dictionaries to lists, and sort them by each item's position
+            # in the input sequence.
+            # NOTE: interleaved modalities are not supported.
+            (
+                sorted_modalities,
+                sorted_mm_positions,
+                sorted_mm_hashes,
+            ) = merge_and_sort_multimodal_metadata(
+                mm_positions,
                 mm_hashes,
-                decoder_inputs.mm_processor_kwargs,
-                precomputed_mm_inputs,
             )
 
+            # NOTE: Sort multimodal inputs/kwargs ONLY IF there are multiple
+            # modalities involved AND the model supports merged input processor.
+            if len(sorted_modalities) > 1 and precomputed_mm_inputs:
+
+                modality_order_dict = {
+                    modality: order
+                    for order, modality in enumerate(sorted_modalities)
+                }
+
+                # Sanity check to make sure each multimodal input has only one
+                # modality key.
+                for mm_input in precomputed_mm_inputs:
+                    assert len(mm_input.modalities) == 1
+
+                # Sort MultiModalKwargs to match sorted_mm_positions
+                precomputed_mm_inputs = sorted(
+                    precomputed_mm_inputs,
+                    key=lambda mm_input: modality_order_dict[list(
+                        mm_input.modalities)[0]])
+
+            # Apply mm input cache update (and input mapper if necessary).
+            sorted_mm_inputs = self.mm_input_mapper_client.process_inputs(
+                mm_data=decoder_mm_data,
+                mm_hashes=sorted_mm_hashes,
+                mm_processor_kwargs=decoder_inputs.mm_processor_kwargs,
+                precomputed_mm_inputs=precomputed_mm_inputs,
+            )
+        else:
+            sorted_mm_inputs = None
+            sorted_mm_hashes = None
+            sorted_mm_positions = None
+
         return EngineCoreRequest(
-            request_id,
-            decoder_inputs.prompt,
-            decoder_inputs.prompt_token_ids,
-            mm_inputs,
-            mm_hashes,
-            decoder_inputs.multi_modal_placeholders,
-            sampling_params,
-            eos_token_id,
-            arrival_time,
-            lora_request,
+            request_id=request_id,
+            prompt=decoder_inputs.prompt,
+            prompt_token_ids=decoder_inputs.prompt_token_ids,
+            mm_inputs=sorted_mm_inputs,
+            mm_hashes=sorted_mm_hashes,
+            mm_placeholders=sorted_mm_positions,
+            sampling_params=sampling_params,
+            eos_token_id=eos_token_id,
+            arrival_time=arrival_time,
+            lora_request=lora_request,
         )
 
     def _validate_model_inputs(self, inputs: ProcessorInputs):
diff --git a/vllm/v1/request.py b/vllm/v1/request.py
index f4783ae366ef0..45450165eaefe 100644
--- a/vllm/v1/request.py
+++ b/vllm/v1/request.py
@@ -1,15 +1,15 @@
 import enum
 from typing import TYPE_CHECKING, List, Optional, Union
 
-from vllm.inputs import DecoderOnlyInputs, SingletonInputsAdapter, token_inputs
 from vllm.lora.request import LoRARequest
-from vllm.multimodal import MultiModalKwargs
 from vllm.sampling_params import SamplingParams
 from vllm.sequence import RequestMetrics
 from vllm.v1.engine import EngineCoreRequest
 from vllm.v1.utils import ConstantList
 
 if TYPE_CHECKING:
+    from vllm.multimodal import MultiModalKwargs
+    from vllm.multimodal.inputs import PlaceholderRange
     from vllm.v1.core.kv_cache_utils import BlockHashType
 
 
@@ -18,14 +18,17 @@ class Request:
     def __init__(
         self,
         request_id: str,
-        inputs: DecoderOnlyInputs,
+        prompt: Optional[str],
+        prompt_token_ids: List[int],
+        multi_modal_inputs: Optional[List["MultiModalKwargs"]],
+        multi_modal_hashes: Optional[List[str]],
+        multi_modal_placeholders: Optional[List["PlaceholderRange"]],
         sampling_params: SamplingParams,
         eos_token_id: Optional[int],
         arrival_time: float,
         lora_request: Optional[LoRARequest] = None,
     ) -> None:
         self.request_id = request_id
-        self.inputs = SingletonInputsAdapter(inputs)
         self.sampling_params = sampling_params
         # Because of LoRA, the eos token id can be different for each request.
         self.eos_token_id = eos_token_id
@@ -41,26 +44,21 @@ def __init__(
         assert sampling_params.max_tokens is not None
         self.max_tokens = sampling_params.max_tokens
 
-        self.prompt = self.inputs.prompt
-        self.prompt_token_ids = self.inputs.prompt_token_ids
+        self.prompt = prompt
+        self.prompt_token_ids = prompt_token_ids
         self.num_prompt_tokens = len(self.prompt_token_ids)
         self._output_token_ids: List[int] = []
         self._all_token_ids: List[int] = self.prompt_token_ids.copy()
         self.num_computed_tokens = 0
 
-        # Multi-modal input metadata.
-        mm_positions = self.inputs.multi_modal_placeholders
-        if mm_positions:
-            # FIXME(woosuk): Support other modalities.
-            self.mm_positions = mm_positions.get("image", [])
-        else:
-            self.mm_positions = []
-        # Output of the mm input mapper (e.g., image tensors).
-        self.mm_inputs: List[MultiModalKwargs] = []
-        if self.inputs.multi_modal_inputs:
-            self.mm_inputs = self.inputs.multi_modal_inputs
+        # Multi-modal related
+        self.mm_positions = multi_modal_placeholders or []
+        self.mm_inputs = multi_modal_inputs or []
+        self.mm_hashes: List[str] = multi_modal_hashes or []
 
-        self.mm_hashes: List[str] = self.inputs.multi_modal_hashes
+        # Sanity check
+        assert len(self.mm_inputs) == len(self.mm_positions)
+        assert len(self.mm_inputs) == len(self.mm_hashes)
 
         # Cache the computed kv block hashes of the request to avoid
         # recomputing.
@@ -70,15 +68,11 @@ def __init__(
     def from_engine_core_request(cls, request: EngineCoreRequest) -> "Request":
         return cls(
             request_id=request.request_id,
-            inputs=token_inputs(
-                prompt_token_ids=request.prompt_token_ids,
-                prompt=request.prompt,
-                multi_modal_data=None,
-                multi_modal_inputs=request.mm_inputs,
-                multi_modal_hashes=request.mm_hashes,
-                multi_modal_placeholders=request.mm_placeholders,
-                mm_processor_kwargs=None,
-            ),
+            prompt=request.prompt,
+            prompt_token_ids=request.prompt_token_ids,
+            multi_modal_inputs=request.mm_inputs,
+            multi_modal_hashes=request.mm_hashes,
+            multi_modal_placeholders=request.mm_placeholders,
             sampling_params=request.sampling_params,
             eos_token_id=request.eos_token_id,
             arrival_time=request.arrival_time,
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 31e693235f99f..a1d4f9b135789 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -19,7 +19,7 @@
                         LayerBlockType, cdiv, is_pin_memory_available)
 from vllm.v1.attention.backends.flash_attn import (FlashAttentionBackend,
                                                    FlashAttentionMetadata)
-from vllm.v1.engine.mm_input_mapper import MMHasher, MMInputMapperClient
+from vllm.v1.engine.mm_input_mapper import MMInputMapperClient
 from vllm.v1.outputs import ModelRunnerOutput
 from vllm.v1.sample.metadata import SamplingMetadata
 from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch
@@ -82,12 +82,10 @@ def __init__(
         self.input_registry = INPUT_REGISTRY
         self.mm_registry = MULTIMODAL_REGISTRY
 
-        # NOTE: mm_input_mapper_client and mm_hasher are only used for memory
-        # profiling.
-        self.mm_input_mapper_client = MMInputMapperClient(self.model_config)
-        self.mm_hasher = MMHasher()
-        self.use_hash = (not model_config.disable_mm_preprocessor_cache) or \
-            cache_config.enable_prefix_caching
+        # NOTE: Initialized input mapper is only used for processing dummy
+        # multimodal data into multimodal kwargs for GPU memory profiling.
+        self.mm_input_mapper_profiling = MMInputMapperClient(self.model_config)
+        self.mm_input_mapper_profiling.use_cache = False
 
         self.max_num_encoder_input_tokens = self.scheduler_config.max_num_encoder_input_tokens  # noqa: E501
         self.encoder_cache_size = self.scheduler_config.encoder_cache_size
@@ -722,8 +720,6 @@ def profile_run(self) -> None:
         ]
 
         # Profile with multimodal encoder & encoder cache.
-        # TODO (ywang96): generalize this beyond image modality since
-        # mm_input_mapper only supports image inputs.
         if self.is_multimodal_model:
 
             # Create dummy batch of multimodal inputs.
@@ -735,15 +731,30 @@ def profile_run(self) -> None:
             dummy_mm_data = dummy_request_data.multi_modal_data
 
             # NOTE: Currently model is profiled with a single non-text
-            # modality even when it supports multiple.
-            max_tokens_per_mm_item = max(
-                self.mm_registry.get_max_tokens_per_item_by_modality(
-                    self.model_config).values())
-
-            max_num_mm_items_encoder_budget = min(
-                self.max_num_encoder_input_tokens,
-                self.encoder_cache_size) // max_tokens_per_mm_item
-
+            # modality with the max possible input tokens even when
+            # it supports multiple.
+            max_tokens_by_modality_dict = self.mm_registry.get_max_tokens_per_item_by_modality(  # noqa: E501
+                self.model_config)
+
+            dummy_data_modality, max_tokens_per_mm_item = max(
+                max_tokens_by_modality_dict.items(), key=lambda item: item[1])
+
+            # Check how many items of this modality can be supported by
+            # the encoder cache budget.
+            encoder_cache_budget = min(self.max_num_encoder_input_tokens,
+                                       self.encoder_cache_size)
+            max_num_mm_items_encoder_budget = encoder_cache_budget // \
+                max_tokens_per_mm_item
+
+            # TODO: Allow users to set encoder_cache_budget in case this
+            # happens.
+            assert max_num_mm_items_encoder_budget > 0, (
+                f"Encoder cache budget={encoder_cache_budget} is too small to "
+                f"support the maximum possible size of multimodal embeddings"
+                f"={max_tokens_per_mm_item}.")
+
+            # Check how many items of this modality can be supported by
+            # the decoder budget.
             max_mm_items_per_req = max(
                 self.mm_registry.get_mm_limits_per_prompt(
                     self.model_config).values())
@@ -763,33 +774,24 @@ def profile_run(self) -> None:
             # they are scheduled to be processed separately.
 
             # Case when models have a merged processor, their dummy data is
-            # already batched `MultiModalKwargs`, therefore we need to "unbatch"
-            # and take the first item in each batched tensor.
-            # TODO (ywang96): This is somewhat hacky. Refactor this to be
-            # consistent with the other case.
+            # already batched `MultiModalKwargs`, therefore we take the first
+            # `MultiModalKwargsItem` from the desired modality to profile on.
             if isinstance(dummy_mm_data, MultiModalKwargs):
-                dummy_mm_kwargs = {
-                    k: v[0].unsqueeze(0)
-                    for k, v in dummy_mm_data.items()
-                }
+                dummy_mm_item = dummy_mm_data.get_item(
+                    modality=dummy_data_modality, item_index=0)
+                dummy_mm_kwargs = MultiModalKwargs.from_items([dummy_mm_item])
 
             # Case when models have dummy data explicitly defined as
             # `MultiModalDataDict`, so they need to be processed through input
             # mapper.
+            # TODO (ywang96): deprecate this path once merged processor is
+            # supported on all models.
             else:
-                # Compute MM hashes (if enabled)
-                mm_hashes = None
-                if self.use_hash:
-                    mm_hashes = self.mm_hasher.hash_dummy_mm_data(
-                        dummy_mm_data)
-
-                mm_kwargs_list = self.mm_input_mapper_client.process_inputs(
+                mm_kwargs_list = self.mm_input_mapper_profiling.process_inputs(
                     mm_data=dummy_mm_data,
-                    mm_hashes=mm_hashes,
+                    mm_hashes=None,
                     mm_processor_kwargs=None,
                     precomputed_mm_inputs=None)
-
-                # Take the first `MultiModalKwargs`
                 dummy_mm_kwargs = mm_kwargs_list[0]
 
             batched_dummy_mm_inputs = MultiModalKwargs.batch(

From 08fb75c72e39dcb4f0751dc59583b95bda4d3656 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Tue, 7 Jan 2025 09:10:54 +0800
Subject: [PATCH 004/115] [Bugfix] Fix LLaVA-NeXT feature size precision error
 (for real) (#11772)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 .../processing/test_llava_next.py             |  3 +-
 .../processing/test_llava_onevision.py        |  3 +-
 vllm/model_executor/models/llava_next.py      | 39 ++++++++-------
 vllm/model_executor/models/llava_onevision.py | 47 ++++++++++---------
 4 files changed, 47 insertions(+), 45 deletions(-)

diff --git a/tests/models/decoder_only/vision_language/processing/test_llava_next.py b/tests/models/decoder_only/vision_language/processing/test_llava_next.py
index 6c8d300717de4..37a6d334ee60c 100644
--- a/tests/models/decoder_only/vision_language/processing/test_llava_next.py
+++ b/tests/models/decoder_only/vision_language/processing/test_llava_next.py
@@ -17,7 +17,8 @@ def processor_for_llava_next():
 
 @pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
 @pytest.mark.parametrize("image_size", [(1669, 2560), (2560, 1669), (183, 488),
-                                        (488, 183), (198, 176), (176, 198)])
+                                        (488, 183), (198, 176), (176, 198),
+                                        (161, 184), (184, 161)])
 @pytest.mark.parametrize("num_imgs", [1, 2])
 def test_processor_prompt_replacements(
     processor_for_llava_next,
diff --git a/tests/models/decoder_only/vision_language/processing/test_llava_onevision.py b/tests/models/decoder_only/vision_language/processing/test_llava_onevision.py
index 71adde6568a17..ed3e2db799be7 100644
--- a/tests/models/decoder_only/vision_language/processing/test_llava_onevision.py
+++ b/tests/models/decoder_only/vision_language/processing/test_llava_onevision.py
@@ -18,7 +18,8 @@ def processor_for_llava_onevision():
 @pytest.mark.parametrize("model_id",
                          ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
 @pytest.mark.parametrize("image_size", [(1669, 2560), (2560, 1669), (183, 488),
-                                        (488, 183), (198, 176), (176, 198)])
+                                        (488, 183), (198, 176), (176, 198),
+                                        (161, 184), (184, 161)])
 @pytest.mark.parametrize("num_imgs", [1, 2])
 def test_processor_prompt_replacements(
     processor_for_llava_onevision,
diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py
index c76ec164a3087..258352416d4a7 100644
--- a/vllm/model_executor/models/llava_next.py
+++ b/vllm/model_executor/models/llava_next.py
@@ -121,30 +121,29 @@ def _get_num_unpadded_features(
         num_patch_height: int,
         num_patch_width: int,
     ) -> tuple[int, int]:
-        current_height = npatches * num_patch_height
-        current_width = npatches * num_patch_width
-
         # NOTE: Use float32 to remain consistent with HF output
-        original_aspect_ratio = np.array(original_width / original_height,
-                                         dtype=np.float32)
-        current_aspect_ratio = np.array(current_width / current_height,
-                                        dtype=np.float32)
+        current_height_f = np.float32(npatches * num_patch_height)
+        current_width_f = np.float32(npatches * num_patch_width)
+
+        original_width_f = np.float32(original_width)
+        original_height_f = np.float32(original_height)
+
+        original_aspect_ratio = original_width_f / original_height_f
+        current_aspect_ratio = current_width_f / current_height_f
 
         if original_aspect_ratio > current_aspect_ratio:
-            scale_factor = np.array(current_width / original_width,
-                                    dtype=np.float32)
-            new_height = int(original_height * scale_factor)
-            padding = (current_height - new_height) // 2
-            current_height -= 2 * padding
+            scale_factor = current_width_f / original_width_f
+            new_height = int(original_height_f * scale_factor)
+            padding = (current_height_f - new_height) // 2
+            current_height_f -= 2 * padding
         else:
-            scale_factor = np.array(current_height / original_height,
-                                    dtype=np.float32)
-            new_width = int(original_width * scale_factor)
-            padding = (current_width - new_width) // 2
-            current_width -= 2 * padding
-
-        unpadded_features = current_height * current_width
-        newline_features = current_height
+            scale_factor = current_height_f / original_height_f
+            new_width = int(original_width_f * scale_factor)
+            padding = (current_width_f - new_width) // 2
+            current_width_f -= 2 * padding
+
+        unpadded_features = int(current_height_f * current_width_f)
+        newline_features = int(current_height_f)
 
         return (unpadded_features, newline_features)
 
diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py
index 9117824995021..62dae74e377be 100644
--- a/vllm/model_executor/models/llava_onevision.py
+++ b/vllm/model_executor/models/llava_onevision.py
@@ -107,36 +107,37 @@ def _get_num_unpadded_features(
         num_patch_height: int,
         num_patch_width: int,
     ) -> tuple[int, int]:
-        current_height = npatches * num_patch_height
-        current_width = npatches * num_patch_width
-
         # NOTE: Use float32 to remain consistent with HF output
-        original_aspect_ratio = np.array(original_width / original_height,
-                                         dtype=np.float32)
-        current_aspect_ratio = np.array(current_width / current_height,
-                                        dtype=np.float32)
+        current_height_f = np.float32(npatches * num_patch_height)
+        current_width_f = np.float32(npatches * num_patch_width)
+
+        original_width_f = np.float32(original_width)
+        original_height_f = np.float32(original_height)
+
+        original_aspect_ratio = original_width_f / original_height_f
+        current_aspect_ratio = current_width_f / current_height_f
 
         if original_aspect_ratio > current_aspect_ratio:
-            scale_factor = np.array(current_width / original_width,
-                                    dtype=np.float32)
-            new_height = int(original_height * scale_factor)
-            padding = (current_height - new_height) // 2
-            current_height -= 2 * padding
+            scale_factor = current_width_f / original_width_f
+            new_height = int(original_height_f * scale_factor)
+            padding = (current_height_f - new_height) // 2
+            current_height_f -= 2 * padding
         else:
-            scale_factor = np.array(current_height / original_height,
-                                    dtype=np.float32)
-            new_width = int(original_width * scale_factor)
-            padding = (current_width - new_width) // 2
-            current_width -= 2 * padding
+            scale_factor = current_height_f / original_height_f
+            new_width = int(original_width_f * scale_factor)
+            padding = (current_width_f - new_width) // 2
+            current_width_f -= 2 * padding
 
-        unpadded_features = current_height * current_width
-        newline_features = current_height
+        unpadded_features = int(current_height_f * current_width_f)
+        newline_features = int(current_height_f)
 
-        ratio = math.sqrt(current_height * current_width / (9 * npatches**2))
+        ratio = math.sqrt(current_height_f * current_width_f /
+                          (9 * npatches**2))
         if ratio > 1.1:
-            unpadded_features = int(current_height // ratio) * int(
-                current_width // ratio)
-            newline_features = int(current_height // ratio)
+            height_factor = int(current_height_f // ratio)
+            width_factor = int(current_width_f // ratio)
+            unpadded_features = height_factor * width_factor
+            newline_features = height_factor
 
         return (unpadded_features, newline_features)
 

From d0169e1b0fa44a80ba40baf92dd2cedd3611076b Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Tue, 7 Jan 2025 11:05:17 +0800
Subject: [PATCH 005/115] [Model] Future-proof Qwen2-Audio multi-modal
 processor (#11776)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 vllm/model_executor/models/qwen2_audio.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py
index a7bb3425ed17c..576b01776e5de 100644
--- a/vllm/model_executor/models/qwen2_audio.py
+++ b/vllm/model_executor/models/qwen2_audio.py
@@ -227,12 +227,14 @@ def get_replacement_qwen2_audio(item_idx: int):
         ]
 
     def _always_apply_prompt_replacements(self) -> bool:
-        # HF never applies prompt replacements, so we have to do it ourselves.
+        # Qwen2-Audio processor will start inserting placeholder tokens
+        # in an upcoming release:
+        # https://github.com/huggingface/transformers/pull/35534
         # NOTE: `_find_placeholders_by_modality` may incorrectly think that HF
         # has already performed processing for multi-audio input when the input
         # audios are short (the corresponding placeholders may take up fewer
         # tokens than the number of audio items)
-        return True
+        return not hasattr(self._get_hf_processor(), "audio_token")
 
 
 @MULTIMODAL_REGISTRY.register_processor(Qwen2AudioMultiModalProcessor)

From d93d2d74fd807a091add17c2065ee8869339f76a Mon Sep 17 00:00:00 2001
From: YiSheng5 <yi.sheng@intel.com>
Date: Tue, 7 Jan 2025 11:09:58 +0800
Subject: [PATCH 006/115] [XPU] Make pp group initilized for
 pipeline-parallelism (#11648)

Signed-off-by: yisheng <yi.sheng@intel.com>
---
 vllm/worker/xpu_worker.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/vllm/worker/xpu_worker.py b/vllm/worker/xpu_worker.py
index 1295666055b04..e9cb623c8eb45 100644
--- a/vllm/worker/xpu_worker.py
+++ b/vllm/worker/xpu_worker.py
@@ -11,6 +11,7 @@
 from vllm.config import VllmConfig
 from vllm.distributed import (ensure_model_parallel_initialized,
                               init_distributed_environment)
+from vllm.distributed.parallel_state import get_pp_group
 from vllm.logger import init_logger
 from vllm.model_executor import set_random_seed
 from vllm.platforms import current_platform
@@ -176,3 +177,8 @@ def init_worker_distributed_environment(self) -> None:
             parallel_config.pipeline_parallel_size)
         # global all_reduce needed for overall oneccl warm up
         torch.distributed.all_reduce(torch.zeros(1).xpu())
+
+        if parallel_config.pipeline_parallel_size > 1:
+            # Run a collective on the pp group first so that p2p
+            # communication is not the first call made on that group
+            get_pp_group().all_reduce(torch.zeros(1).xpu())

From 8ceffbf3152d3b26d293ba1e157d0c187884572b Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Tue, 7 Jan 2025 11:20:01 +0800
Subject: [PATCH 007/115] [Doc][3/N] Reorganize Serving section (#11766)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 README.md                                     |   2 +-
 .../architecture_helm_deployment.png          | Bin
 .../contributing/dockerfile/dockerfile.md     |   2 +-
 .../source/contributing/model/registration.md |   4 +-
 .../docker.md}                                |   4 +-
 .../frameworks/bentoml.md}                    |   4 +-
 .../frameworks/cerebrium.md}                  |   4 +-
 .../frameworks/dstack.md}                     |   4 +-
 .../frameworks/helm.md}                       |   6 +-
 docs/source/deployment/frameworks/index.md    |  13 +++
 .../frameworks/lws.md}                        |   4 +-
 .../frameworks/skypilot.md}                   |   8 +-
 .../frameworks/triton.md}                     |   4 +-
 docs/source/deployment/integrations/index.md  |   9 ++
 .../integrations/kserve.md}                   |   4 +-
 .../integrations/kubeai.md}                   |   4 +-
 .../integrations/llamastack.md}               |   4 +-
 .../k8s.md}                                   |   4 +-
 .../nginx.md}                                 |   2 +-
 docs/source/design/arch_overview.md           |   2 +-
 docs/source/features/disagg_prefill.md        |   8 +-
 docs/source/features/spec_decode.md           |   2 +-
 .../getting_started/installation/gpu-rocm.md  |   2 +-
 .../getting_started/installation/hpu-gaudi.md |   2 +-
 docs/source/getting_started/quickstart.md     |  18 ++--
 docs/source/index.md                          |  49 ++++++-----
 docs/source/models/extensions/index.md        |   8 ++
 .../extensions}/runai_model_streamer.md       |   2 +-
 .../extensions}/tensorizer.md                 |   2 +-
 docs/source/models/supported_models.md        |  44 +++++-----
 docs/source/serving/distributed_serving.md    |  12 +--
 docs/source/serving/integrations.md           |  17 ----
 docs/source/serving/integrations/index.md     |   8 ++
 .../langchain.md}                             |   8 +-
 .../llamaindex.md}                            |   8 +-
 docs/source/serving/metrics.md                |   2 +-
 .../multimodal_inputs.md                      |  12 +--
 docs/source/serving/offline_inference.md      |  79 ++++++++++++++++++
 .../serving/openai_compatible_server.md       |   8 +-
 docs/source/serving/usage_stats.md            |   2 +-
 40 files changed, 248 insertions(+), 133 deletions(-)
 rename docs/source/{serving => assets/deployment}/architecture_helm_deployment.png (100%)
 rename docs/source/{serving/deploying_with_docker.md => deployment/docker.md} (98%)
 rename docs/source/{serving/deploying_with_bentoml.md => deployment/frameworks/bentoml.md} (89%)
 rename docs/source/{serving/deploying_with_cerebrium.md => deployment/frameworks/cerebrium.md} (98%)
 rename docs/source/{serving/deploying_with_dstack.md => deployment/frameworks/dstack.md} (98%)
 rename docs/source/{serving/deploying_with_helm.md => deployment/frameworks/helm.md} (98%)
 create mode 100644 docs/source/deployment/frameworks/index.md
 rename docs/source/{serving/deploying_with_lws.md => deployment/frameworks/lws.md} (91%)
 rename docs/source/{serving/run_on_sky.md => deployment/frameworks/skypilot.md} (98%)
 rename docs/source/{serving/deploying_with_triton.md => deployment/frameworks/triton.md} (87%)
 create mode 100644 docs/source/deployment/integrations/index.md
 rename docs/source/{serving/deploying_with_kserve.md => deployment/integrations/kserve.md} (85%)
 rename docs/source/{serving/deploying_with_kubeai.md => deployment/integrations/kubeai.md} (93%)
 rename docs/source/{serving/serving_with_llamastack.md => deployment/integrations/llamastack.md} (95%)
 rename docs/source/{serving/deploying_with_k8s.md => deployment/k8s.md} (99%)
 rename docs/source/{serving/deploying_with_nginx.md => deployment/nginx.md} (99%)
 create mode 100644 docs/source/models/extensions/index.md
 rename docs/source/{serving => models/extensions}/runai_model_streamer.md (98%)
 rename docs/source/{serving => models/extensions}/tensorizer.md (95%)
 delete mode 100644 docs/source/serving/integrations.md
 create mode 100644 docs/source/serving/integrations/index.md
 rename docs/source/serving/{serving_with_langchain.md => integrations/langchain.md} (82%)
 rename docs/source/serving/{serving_with_llamaindex.md => integrations/llamaindex.md} (74%)
 rename docs/source/{features => serving}/multimodal_inputs.md (95%)
 create mode 100644 docs/source/serving/offline_inference.md

diff --git a/README.md b/README.md
index f83c9d759b359..652268ec29cac 100644
--- a/README.md
+++ b/README.md
@@ -77,7 +77,7 @@ pip install vllm
 Visit our [documentation](https://vllm.readthedocs.io/en/latest/) to learn more.
 - [Installation](https://vllm.readthedocs.io/en/latest/getting_started/installation.html)
 - [Quickstart](https://vllm.readthedocs.io/en/latest/getting_started/quickstart.html)
-- [Supported Models](https://vllm.readthedocs.io/en/latest/models/supported_models.html)
+- [List of Supported Models](https://vllm.readthedocs.io/en/latest/models/supported_models.html)
 
 ## Contributing
 
diff --git a/docs/source/serving/architecture_helm_deployment.png b/docs/source/assets/deployment/architecture_helm_deployment.png
similarity index 100%
rename from docs/source/serving/architecture_helm_deployment.png
rename to docs/source/assets/deployment/architecture_helm_deployment.png
diff --git a/docs/source/contributing/dockerfile/dockerfile.md b/docs/source/contributing/dockerfile/dockerfile.md
index 7ffec83333d7d..38ea956ba8dfb 100644
--- a/docs/source/contributing/dockerfile/dockerfile.md
+++ b/docs/source/contributing/dockerfile/dockerfile.md
@@ -1,7 +1,7 @@
 # Dockerfile
 
 We provide a <gh-file:Dockerfile> to construct the image for running an OpenAI compatible server with vLLM.
-More information about deploying with Docker can be found [here](../../serving/deploying_with_docker.md).
+More information about deploying with Docker can be found [here](#deployment-docker).
 
 Below is a visual representation of the multi-stage Dockerfile. The build graph contains the following nodes:
 
diff --git a/docs/source/contributing/model/registration.md b/docs/source/contributing/model/registration.md
index cf1cdb0c9de0f..fe5aa94c52896 100644
--- a/docs/source/contributing/model/registration.md
+++ b/docs/source/contributing/model/registration.md
@@ -3,7 +3,7 @@
 # Model Registration
 
 vLLM relies on a model registry to determine how to run each model.
-A list of pre-registered architectures can be found on the [Supported Models](#supported-models) page.
+A list of pre-registered architectures can be found [here](#supported-models).
 
 If your model is not on this list, you must register it to vLLM.
 This page provides detailed instructions on how to do so.
@@ -16,7 +16,7 @@ This gives you the ability to modify the codebase and test your model.
 After you have implemented your model (see [tutorial](#new-model-basic)), put it into the <gh-dir:vllm/model_executor/models> directory.
 Then, add your model class to `_VLLM_MODELS` in <gh-file:vllm/model_executor/models/registry.py> so that it is automatically registered upon importing vLLM.
 You should also include an example HuggingFace repository for this model in <gh-file:tests/models/registry.py> to run the unit tests.
-Finally, update the [Supported Models](#supported-models) documentation page to promote your model!
+Finally, update our [list of supported models](#supported-models) to promote your model!
 
 ```{important}
 The list of models in each section should be maintained in alphabetical order.
diff --git a/docs/source/serving/deploying_with_docker.md b/docs/source/deployment/docker.md
similarity index 98%
rename from docs/source/serving/deploying_with_docker.md
rename to docs/source/deployment/docker.md
index 844bd27800c7a..2df1aca27f1e6 100644
--- a/docs/source/serving/deploying_with_docker.md
+++ b/docs/source/deployment/docker.md
@@ -1,6 +1,6 @@
-(deploying-with-docker)=
+(deployment-docker)=
 
-# Deploying with Docker
+# Using Docker
 
 ## Use vLLM's Official Docker Image
 
diff --git a/docs/source/serving/deploying_with_bentoml.md b/docs/source/deployment/frameworks/bentoml.md
similarity index 89%
rename from docs/source/serving/deploying_with_bentoml.md
rename to docs/source/deployment/frameworks/bentoml.md
index dfa0de4f0f6d7..ea0b5d1d4c93b 100644
--- a/docs/source/serving/deploying_with_bentoml.md
+++ b/docs/source/deployment/frameworks/bentoml.md
@@ -1,6 +1,6 @@
-(deploying-with-bentoml)=
+(deployment-bentoml)=
 
-# Deploying with BentoML
+# BentoML
 
 [BentoML](https://github.com/bentoml/BentoML) allows you to deploy a large language model (LLM) server with vLLM as the backend, which exposes OpenAI-compatible endpoints. You can serve the model locally or containerize it as an OCI-complicant image and deploy it on Kubernetes.
 
diff --git a/docs/source/serving/deploying_with_cerebrium.md b/docs/source/deployment/frameworks/cerebrium.md
similarity index 98%
rename from docs/source/serving/deploying_with_cerebrium.md
rename to docs/source/deployment/frameworks/cerebrium.md
index 950064c8c1b10..be018dfb75d7a 100644
--- a/docs/source/serving/deploying_with_cerebrium.md
+++ b/docs/source/deployment/frameworks/cerebrium.md
@@ -1,6 +1,6 @@
-(deploying-with-cerebrium)=
+(deployment-cerebrium)=
 
-# Deploying with Cerebrium
+# Cerebrium
 
 ```{raw} html
 <p align="center">
diff --git a/docs/source/serving/deploying_with_dstack.md b/docs/source/deployment/frameworks/dstack.md
similarity index 98%
rename from docs/source/serving/deploying_with_dstack.md
rename to docs/source/deployment/frameworks/dstack.md
index 381f5f786ca2c..4142c1d9f1f60 100644
--- a/docs/source/serving/deploying_with_dstack.md
+++ b/docs/source/deployment/frameworks/dstack.md
@@ -1,6 +1,6 @@
-(deploying-with-dstack)=
+(deployment-dstack)=
 
-# Deploying with dstack
+# dstack
 
 ```{raw} html
 <p align="center">
diff --git a/docs/source/serving/deploying_with_helm.md b/docs/source/deployment/frameworks/helm.md
similarity index 98%
rename from docs/source/serving/deploying_with_helm.md
rename to docs/source/deployment/frameworks/helm.md
index 7286a0a88968f..18ed293191468 100644
--- a/docs/source/serving/deploying_with_helm.md
+++ b/docs/source/deployment/frameworks/helm.md
@@ -1,6 +1,6 @@
-(deploying-with-helm)=
+(deployment-helm)=
 
-# Deploying with Helm
+# Helm
 
 A Helm chart to deploy vLLM for Kubernetes
 
@@ -38,7 +38,7 @@ chart **including persistent volumes** and deletes the release.
 
 ## Architecture
 
-```{image} architecture_helm_deployment.png
+```{image} /assets/deployment/architecture_helm_deployment.png
 ```
 
 ## Values
diff --git a/docs/source/deployment/frameworks/index.md b/docs/source/deployment/frameworks/index.md
new file mode 100644
index 0000000000000..6a59131d36618
--- /dev/null
+++ b/docs/source/deployment/frameworks/index.md
@@ -0,0 +1,13 @@
+# Using other frameworks
+
+```{toctree}
+:maxdepth: 1
+
+bentoml
+cerebrium
+dstack
+helm
+lws
+skypilot
+triton
+```
diff --git a/docs/source/serving/deploying_with_lws.md b/docs/source/deployment/frameworks/lws.md
similarity index 91%
rename from docs/source/serving/deploying_with_lws.md
rename to docs/source/deployment/frameworks/lws.md
index 22bab419eaca3..349fa83fbcb9d 100644
--- a/docs/source/serving/deploying_with_lws.md
+++ b/docs/source/deployment/frameworks/lws.md
@@ -1,6 +1,6 @@
-(deploying-with-lws)=
+(deployment-lws)=
 
-# Deploying with LWS
+# LWS
 
 LeaderWorkerSet (LWS) is a Kubernetes API that aims to address common deployment patterns of AI/ML inference workloads.
 A major use case is for multi-host/multi-node distributed inference.
diff --git a/docs/source/serving/run_on_sky.md b/docs/source/deployment/frameworks/skypilot.md
similarity index 98%
rename from docs/source/serving/run_on_sky.md
rename to docs/source/deployment/frameworks/skypilot.md
index 115873ae49292..f02a943026922 100644
--- a/docs/source/serving/run_on_sky.md
+++ b/docs/source/deployment/frameworks/skypilot.md
@@ -1,6 +1,6 @@
-(on-cloud)=
+(deployment-skypilot)=
 
-# Deploying and scaling up with SkyPilot
+# SkyPilot
 
 ```{raw} html
 <p align="center">
@@ -12,9 +12,9 @@ vLLM can be **run and scaled to multiple service replicas on clouds and Kubernet
 
 ## Prerequisites
 
-- Go to the [HuggingFace model page](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) and request access to the model {code}`meta-llama/Meta-Llama-3-8B-Instruct`.
+- Go to the [HuggingFace model page](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) and request access to the model `meta-llama/Meta-Llama-3-8B-Instruct`.
 - Check that you have installed SkyPilot ([docs](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html)).
-- Check that {code}`sky check` shows clouds or Kubernetes are enabled.
+- Check that `sky check` shows clouds or Kubernetes are enabled.
 
 ```console
 pip install skypilot-nightly
diff --git a/docs/source/serving/deploying_with_triton.md b/docs/source/deployment/frameworks/triton.md
similarity index 87%
rename from docs/source/serving/deploying_with_triton.md
rename to docs/source/deployment/frameworks/triton.md
index 9b0a6f1d54ae8..94d87120159c6 100644
--- a/docs/source/serving/deploying_with_triton.md
+++ b/docs/source/deployment/frameworks/triton.md
@@ -1,5 +1,5 @@
-(deploying-with-triton)=
+(deployment-triton)=
 
-# Deploying with NVIDIA Triton
+# NVIDIA Triton
 
 The [Triton Inference Server](https://github.com/triton-inference-server) hosts a tutorial demonstrating how to quickly deploy a simple [facebook/opt-125m](https://huggingface.co/facebook/opt-125m) model using vLLM. Please see [Deploying a vLLM model in Triton](https://github.com/triton-inference-server/tutorials/blob/main/Quick_Deploy/vLLM/README.md#deploying-a-vllm-model-in-triton) for more details.
diff --git a/docs/source/deployment/integrations/index.md b/docs/source/deployment/integrations/index.md
new file mode 100644
index 0000000000000..d47ede8967547
--- /dev/null
+++ b/docs/source/deployment/integrations/index.md
@@ -0,0 +1,9 @@
+# External Integrations
+
+```{toctree}
+:maxdepth: 1
+
+kserve
+kubeai
+llamastack
+```
diff --git a/docs/source/serving/deploying_with_kserve.md b/docs/source/deployment/integrations/kserve.md
similarity index 85%
rename from docs/source/serving/deploying_with_kserve.md
rename to docs/source/deployment/integrations/kserve.md
index feaeb5d0ec8a2..c780fd74e8f55 100644
--- a/docs/source/serving/deploying_with_kserve.md
+++ b/docs/source/deployment/integrations/kserve.md
@@ -1,6 +1,6 @@
-(deploying-with-kserve)=
+(deployment-kserve)=
 
-# Deploying with KServe
+# KServe
 
 vLLM can be deployed with [KServe](https://github.com/kserve/kserve) on Kubernetes for highly scalable distributed model serving.
 
diff --git a/docs/source/serving/deploying_with_kubeai.md b/docs/source/deployment/integrations/kubeai.md
similarity index 93%
rename from docs/source/serving/deploying_with_kubeai.md
rename to docs/source/deployment/integrations/kubeai.md
index 3609d7e05acd3..2f5772e075d87 100644
--- a/docs/source/serving/deploying_with_kubeai.md
+++ b/docs/source/deployment/integrations/kubeai.md
@@ -1,6 +1,6 @@
-(deploying-with-kubeai)=
+(deployment-kubeai)=
 
-# Deploying with KubeAI
+# KubeAI
 
 [KubeAI](https://github.com/substratusai/kubeai) is a Kubernetes operator that enables you to deploy and manage AI models on Kubernetes. It provides a simple and scalable way to deploy vLLM in production. Functionality such as scale-from-zero, load based autoscaling, model caching, and much more is provided out of the box with zero external dependencies.
 
diff --git a/docs/source/serving/serving_with_llamastack.md b/docs/source/deployment/integrations/llamastack.md
similarity index 95%
rename from docs/source/serving/serving_with_llamastack.md
rename to docs/source/deployment/integrations/llamastack.md
index 71dadca7ad47c..474d2bdfa9580 100644
--- a/docs/source/serving/serving_with_llamastack.md
+++ b/docs/source/deployment/integrations/llamastack.md
@@ -1,6 +1,6 @@
-(run-on-llamastack)=
+(deployment-llamastack)=
 
-# Serving with Llama Stack
+# Llama Stack
 
 vLLM is also available via [Llama Stack](https://github.com/meta-llama/llama-stack) .
 
diff --git a/docs/source/serving/deploying_with_k8s.md b/docs/source/deployment/k8s.md
similarity index 99%
rename from docs/source/serving/deploying_with_k8s.md
rename to docs/source/deployment/k8s.md
index 5f9b0e4f55ecc..760214e112fba 100644
--- a/docs/source/serving/deploying_with_k8s.md
+++ b/docs/source/deployment/k8s.md
@@ -1,6 +1,6 @@
-(deploying-with-k8s)=
+(deployment-k8s)=
 
-# Deploying with Kubernetes
+# Using Kubernetes
 
 Using Kubernetes to deploy vLLM is a scalable and efficient way to serve machine learning models. This guide will walk you through the process of deploying vLLM with Kubernetes, including the necessary prerequisites, steps for deployment, and testing.
 
diff --git a/docs/source/serving/deploying_with_nginx.md b/docs/source/deployment/nginx.md
similarity index 99%
rename from docs/source/serving/deploying_with_nginx.md
rename to docs/source/deployment/nginx.md
index a1f00d8536465..a58f791c2997b 100644
--- a/docs/source/serving/deploying_with_nginx.md
+++ b/docs/source/deployment/nginx.md
@@ -1,6 +1,6 @@
 (nginxloadbalancer)=
 
-# Deploying with Nginx Loadbalancer
+# Using Nginx
 
 This document shows how to launch multiple vLLM serving containers and use Nginx to act as a load balancer between the servers.
 
diff --git a/docs/source/design/arch_overview.md b/docs/source/design/arch_overview.md
index 2f1280c047672..5e0dd021ad02e 100644
--- a/docs/source/design/arch_overview.md
+++ b/docs/source/design/arch_overview.md
@@ -57,7 +57,7 @@ More API details can be found in the {doc}`Offline Inference
 
 The code for the `LLM` class can be found in <gh-file:vllm/entrypoints/llm.py>.
 
-### OpenAI-compatible API server
+### OpenAI-Compatible API Server
 
 The second primary interface to vLLM is via its OpenAI-compatible API server.
 This server can be started using the `vllm serve` command.
diff --git a/docs/source/features/disagg_prefill.md b/docs/source/features/disagg_prefill.md
index 05226f2dec87c..645dc60807dd3 100644
--- a/docs/source/features/disagg_prefill.md
+++ b/docs/source/features/disagg_prefill.md
@@ -1,8 +1,12 @@
 (disagg-prefill)=
 
-# Disaggregated prefilling (experimental)
+# Disaggregated Prefilling (experimental)
 
-This page introduces you the disaggregated prefilling feature in vLLM. This feature is experimental and subject to change.
+This page introduces you to the disaggregated prefilling feature in vLLM.
+
+```{note}
+This feature is experimental and subject to change.
+```
 
 ## Why disaggregated prefilling?
 
diff --git a/docs/source/features/spec_decode.md b/docs/source/features/spec_decode.md
index 8c52c97a41e48..bc8a0aa14dc5a 100644
--- a/docs/source/features/spec_decode.md
+++ b/docs/source/features/spec_decode.md
@@ -1,6 +1,6 @@
 (spec-decode)=
 
-# Speculative decoding
+# Speculative Decoding
 
 ```{warning}
 Please note that speculative decoding in vLLM is not yet optimized and does
diff --git a/docs/source/getting_started/installation/gpu-rocm.md b/docs/source/getting_started/installation/gpu-rocm.md
index 796911d7305a6..e36b92513e31d 100644
--- a/docs/source/getting_started/installation/gpu-rocm.md
+++ b/docs/source/getting_started/installation/gpu-rocm.md
@@ -148,7 +148,7 @@ $ export PYTORCH_ROCM_ARCH="gfx90a;gfx942"
 $ python3 setup.py develop
 ```
 
-This may take 5-10 minutes. Currently, {code}`pip install .` does not work for ROCm installation.
+This may take 5-10 minutes. Currently, `pip install .` does not work for ROCm installation.
 
 ```{tip}
 - Triton flash attention is used by default. For benchmarking purposes, it is recommended to run a warm up step before collecting perf numbers.
diff --git a/docs/source/getting_started/installation/hpu-gaudi.md b/docs/source/getting_started/installation/hpu-gaudi.md
index 94de169f51a73..1d50cef3bdc83 100644
--- a/docs/source/getting_started/installation/hpu-gaudi.md
+++ b/docs/source/getting_started/installation/hpu-gaudi.md
@@ -82,7 +82,7 @@ $ python setup.py develop
 
 ## Supported Features
 
-- [Offline batched inference](#offline-batched-inference)
+- [Offline inference](#offline-inference)
 - Online inference via [OpenAI-Compatible Server](#openai-compatible-server)
 - HPU autodetection - no need to manually select device within vLLM
 - Paged KV cache with algorithms enabled for Intel Gaudi accelerators
diff --git a/docs/source/getting_started/quickstart.md b/docs/source/getting_started/quickstart.md
index ff216f8af30f9..3f9556165ece4 100644
--- a/docs/source/getting_started/quickstart.md
+++ b/docs/source/getting_started/quickstart.md
@@ -2,20 +2,20 @@
 
 # Quickstart
 
-This guide will help you quickly get started with vLLM to:
+This guide will help you quickly get started with vLLM to perform:
 
-- [Run offline batched inference](#offline-batched-inference)
-- [Run OpenAI-compatible inference](#openai-compatible-server)
+- [Offline batched inference](#quickstart-offline)
+- [Online inference using OpenAI-compatible server](#quickstart-online)
 
 ## Prerequisites
 
 - OS: Linux
 - Python: 3.9 -- 3.12
-- GPU: compute capability 7.0 or higher (e.g., V100, T4, RTX20xx, A100, L4, H100, etc.)
 
 ## Installation
 
-You can install vLLM using pip. It's recommended to use [conda](https://docs.conda.io/projects/conda/en/latest/user-guide/getting-started.html) to create and manage Python environments.
+If you are using NVIDIA GPUs, you can install vLLM using [pip](https://pypi.org/project/vllm/) directly.
+It's recommended to use [conda](https://docs.conda.io/projects/conda/en/latest/user-guide/getting-started.html) to create and manage Python environments.
 
 ```console
 $ conda create -n myenv python=3.10 -y
@@ -23,9 +23,11 @@ $ conda activate myenv
 $ pip install vllm
 ```
 
-Please refer to the [installation documentation](#installation-index) for more details on installing vLLM.
+```{note}
+For non-CUDA platforms, please refer to [this page](#installation-index) for specific instructions on how to install vLLM.
+```
 
-(offline-batched-inference)=
+(quickstart-offline)=
 
 ## Offline Batched Inference
 
@@ -73,7 +75,7 @@ for output in outputs:
     print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
 ```
 
-(openai-compatible-server)=
+(quickstart-online)=
 
 ## OpenAI-Compatible Server
 
diff --git a/docs/source/index.md b/docs/source/index.md
index 4bc40bf0f5e41..c335155bd6e14 100644
--- a/docs/source/index.md
+++ b/docs/source/index.md
@@ -65,32 +65,14 @@ getting_started/troubleshooting
 getting_started/faq
 ```
 
-```{toctree}
-:caption: Serving
-:maxdepth: 1
-
-serving/openai_compatible_server
-serving/deploying_with_docker
-serving/deploying_with_k8s
-serving/deploying_with_helm
-serving/deploying_with_nginx
-serving/distributed_serving
-serving/metrics
-serving/integrations
-serving/tensorizer
-serving/runai_model_streamer
-serving/engine_args
-serving/env_vars
-serving/usage_stats
-```
-
 ```{toctree}
 :caption: Models
 :maxdepth: 1
 
-models/supported_models
 models/generative_models
 models/pooling_models
+models/supported_models
+models/extensions/index
 ```
 
 ```{toctree}
@@ -99,7 +81,6 @@ models/pooling_models
 
 features/quantization/index
 features/lora
-features/multimodal_inputs
 features/tool_calling
 features/structured_outputs
 features/automatic_prefix_caching
@@ -108,6 +89,32 @@ features/spec_decode
 features/compatibility_matrix
 ```
 
+```{toctree}
+:caption: Inference and Serving
+:maxdepth: 1
+
+serving/offline_inference
+serving/openai_compatible_server
+serving/multimodal_inputs
+serving/distributed_serving
+serving/metrics
+serving/engine_args
+serving/env_vars
+serving/usage_stats
+serving/integrations/index
+```
+
+```{toctree}
+:caption: Deployment
+:maxdepth: 1
+
+deployment/docker
+deployment/k8s
+deployment/nginx
+deployment/frameworks/index
+deployment/integrations/index
+```
+
 ```{toctree}
 :caption: Performance
 :maxdepth: 1
diff --git a/docs/source/models/extensions/index.md b/docs/source/models/extensions/index.md
new file mode 100644
index 0000000000000..cff09d12eba47
--- /dev/null
+++ b/docs/source/models/extensions/index.md
@@ -0,0 +1,8 @@
+# Built-in Extensions
+
+```{toctree}
+:maxdepth: 1
+
+runai_model_streamer
+tensorizer
+```
diff --git a/docs/source/serving/runai_model_streamer.md b/docs/source/models/extensions/runai_model_streamer.md
similarity index 98%
rename from docs/source/serving/runai_model_streamer.md
rename to docs/source/models/extensions/runai_model_streamer.md
index d4269050ff574..fe2701194a604 100644
--- a/docs/source/serving/runai_model_streamer.md
+++ b/docs/source/models/extensions/runai_model_streamer.md
@@ -1,6 +1,6 @@
 (runai-model-streamer)=
 
-# Loading Models with Run:ai Model Streamer
+# Loading models with Run:ai Model Streamer
 
 Run:ai Model Streamer is a library to read tensors in concurrency, while streaming it to GPU memory.
 Further reading can be found in [Run:ai Model Streamer Documentation](https://github.com/run-ai/runai-model-streamer/blob/master/docs/README.md).
diff --git a/docs/source/serving/tensorizer.md b/docs/source/models/extensions/tensorizer.md
similarity index 95%
rename from docs/source/serving/tensorizer.md
rename to docs/source/models/extensions/tensorizer.md
index d3dd29d48f730..42ed5c795dd27 100644
--- a/docs/source/serving/tensorizer.md
+++ b/docs/source/models/extensions/tensorizer.md
@@ -1,6 +1,6 @@
 (tensorizer)=
 
-# Loading Models with CoreWeave's Tensorizer
+# Loading models with CoreWeave's Tensorizer
 
 vLLM supports loading models with [CoreWeave's Tensorizer](https://docs.coreweave.com/coreweave-machine-learning-and-ai/inference/tensorizer).
 vLLM model tensors that have been serialized to disk, an HTTP/HTTPS endpoint, or S3 endpoint can be deserialized
diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md
index 94a8849f7edcd..590bea992d1fc 100644
--- a/docs/source/models/supported_models.md
+++ b/docs/source/models/supported_models.md
@@ -1,9 +1,9 @@
 (supported-models)=
 
-# Supported Models
+# List of Supported Models
 
 vLLM supports generative and pooling models across various tasks.
-If a model supports more than one task, you can set the task via the {code}`--task` argument.
+If a model supports more than one task, you can set the task via the `--task` argument.
 
 For each task, we list the model architectures that have been implemented in vLLM.
 Alongside each architecture, we include some popular models that use it.
@@ -14,8 +14,8 @@ Alongside each architecture, we include some popular models that use it.
 
 By default, vLLM loads models from [HuggingFace (HF) Hub](https://huggingface.co/models).
 
-To determine whether a given model is supported, you can check the {code}`config.json` file inside the HF repository.
-If the {code}`"architectures"` field contains a model architecture listed below, then it should be supported in theory.
+To determine whether a given model is supported, you can check the `config.json` file inside the HF repository.
+If the `"architectures"` field contains a model architecture listed below, then it should be supported in theory.
 
 ````{tip}
 The easiest way to check if your model is really supported at runtime is to run the program below:
@@ -48,7 +48,7 @@ To use models from [ModelScope](https://www.modelscope.cn) instead of HuggingFac
 $ export VLLM_USE_MODELSCOPE=True
 ```
 
-And use with {code}`trust_remote_code=True`.
+And use with `trust_remote_code=True`.
 
 ```python
 from vllm import LLM
@@ -420,15 +420,15 @@ you should explicitly specify the task type to ensure that the model is used in
 ```
 
 ```{note}
-{code}`ssmits/Qwen2-7B-Instruct-embed-base` has an improperly defined Sentence Transformers config.
-You should manually set mean pooling by passing {code}`--override-pooler-config '{"pooling_type": "MEAN"}'`.
+`ssmits/Qwen2-7B-Instruct-embed-base` has an improperly defined Sentence Transformers config.
+You should manually set mean pooling by passing `--override-pooler-config '{"pooling_type": "MEAN"}'`.
 ```
 
 ```{note}
-Unlike base Qwen2, {code}`Alibaba-NLP/gte-Qwen2-7B-instruct` uses bi-directional attention.
-You can set {code}`--hf-overrides '{"is_causal": false}'` to change the attention mask accordingly.
+Unlike base Qwen2, `Alibaba-NLP/gte-Qwen2-7B-instruct` uses bi-directional attention.
+You can set `--hf-overrides '{"is_causal": false}'` to change the attention mask accordingly.
 
-On the other hand, its 1.5B variant ({code}`Alibaba-NLP/gte-Qwen2-1.5B-instruct`) uses causal attention
+On the other hand, its 1.5B variant (`Alibaba-NLP/gte-Qwen2-1.5B-instruct`) uses causal attention
 despite being described otherwise on its model card.
 ```
 
@@ -468,8 +468,8 @@ If your model is not in the above list, we will try to automatically convert the
 {func}`vllm.model_executor.models.adapters.as_reward_model`. By default, we return the hidden states of each token directly.
 
 ```{important}
-For process-supervised reward models such as {code}`peiyi9979/math-shepherd-mistral-7b-prm`, the pooling config should be set explicitly,
-e.g.: {code}`--override-pooler-config '{"pooling_type": "STEP", "step_tag_id": 123, "returned_token_ids": [456, 789]}'`.
+For process-supervised reward models such as `peiyi9979/math-shepherd-mistral-7b-prm`, the pooling config should be set explicitly,
+e.g.: `--override-pooler-config '{"pooling_type": "STEP", "step_tag_id": 123, "returned_token_ids": [456, 789]}'`.
 ```
 
 #### Classification (`--task classify`)
@@ -537,13 +537,13 @@ The following modalities are supported depending on the model:
 - **V**ideo
 - **A**udio
 
-Any combination of modalities joined by {code}`+` are supported.
+Any combination of modalities joined by `+` is supported.
 
-- e.g.: {code}`T + I` means that the model supports text-only, image-only, and text-with-image inputs.
+- e.g.: `T + I` means that the model supports text-only, image-only, and text-with-image inputs.
 
-On the other hand, modalities separated by {code}`/` are mutually exclusive.
+On the other hand, modalities separated by `/` are mutually exclusive.
 
-- e.g.: {code}`T / I` means that the model supports text-only and image-only inputs, but not text-with-image inputs.
+- e.g.: `T / I` means that the model supports text-only and image-only inputs, but not text-with-image inputs.
 
 See [this page](#multimodal-inputs) on how to pass multi-modal inputs to the model.
 
@@ -731,8 +731,8 @@ See [this page](#generative-models) for more information on how to use generativ
 <sup>+</sup> Multiple items can be inputted per text prompt for this modality.
 
 ````{important}
-To enable multiple multi-modal items per text prompt, you have to set {code}`limit_mm_per_prompt` (offline inference)
-or {code}`--limit-mm-per-prompt` (online inference). For example, to enable passing up to 4 images per text prompt:
+To enable multiple multi-modal items per text prompt, you have to set `limit_mm_per_prompt` (offline inference)
+or `--limit-mm-per-prompt` (online inference). For example, to enable passing up to 4 images per text prompt:
 
 ```python
 llm = LLM(
@@ -751,11 +751,11 @@ vLLM currently only supports adding LoRA to the language backbone of multimodal
 ```
 
 ```{note}
-To use {code}`TIGER-Lab/Mantis-8B-siglip-llama3`, you have pass {code}`--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'` when running vLLM.
+To use `TIGER-Lab/Mantis-8B-siglip-llama3`, you have to pass `--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'` when running vLLM.
 ```
 
 ```{note}
-The official {code}`openbmb/MiniCPM-V-2` doesn't work yet, so we need to use a fork ({code}`HwwwH/MiniCPM-V-2`) for now.
+The official `openbmb/MiniCPM-V-2` doesn't work yet, so we need to use a fork (`HwwwH/MiniCPM-V-2`) for now.
 For more details, please see: <gh-pr:4087#issuecomment-2250397630>
 ```
 
@@ -770,7 +770,7 @@ you should explicitly specify the task type to ensure that the model is used in
 
 #### Text Embedding (`--task embed`)
 
-Any text generation model can be converted into an embedding model by passing {code}`--task embed`.
+Any text generation model can be converted into an embedding model by passing `--task embed`.
 
 ```{note}
 To get the best results, you should use pooling models that are specifically trained as such.
@@ -818,7 +818,7 @@ At vLLM, we are committed to facilitating the integration and support of third-p
 2. **Best-Effort Consistency**: While we aim to maintain a level of consistency between the models implemented in vLLM and other frameworks like transformers, complete alignment is not always feasible. Factors like acceleration techniques and the use of low-precision computations can introduce discrepancies. Our commitment is to ensure that the implemented models are functional and produce sensible results.
 
 ```{tip}
-When comparing the output of {code}`model.generate` from HuggingFace Transformers with the output of {code}`llm.generate` from vLLM, note that the former reads the model's generation config file (i.e., [generation_config.json](https://github.com/huggingface/transformers/blob/19dabe96362803fb0a9ae7073d03533966598b17/src/transformers/generation/utils.py#L1945)) and applies the default parameters for generation, while the latter only uses the parameters passed to the function. Ensure all sampling parameters are identical when comparing outputs.
+When comparing the output of `model.generate` from HuggingFace Transformers with the output of `llm.generate` from vLLM, note that the former reads the model's generation config file (i.e., [generation_config.json](https://github.com/huggingface/transformers/blob/19dabe96362803fb0a9ae7073d03533966598b17/src/transformers/generation/utils.py#L1945)) and applies the default parameters for generation, while the latter only uses the parameters passed to the function. Ensure all sampling parameters are identical when comparing outputs.
 ```
 
 3. **Issue Resolution and Model Updates**: Users are encouraged to report any bugs or issues they encounter with third-party models. Proposed fixes should be submitted via PRs, with a clear explanation of the problem and the rationale behind the proposed solution. If a fix for one model impacts another, we rely on the community to highlight and address these cross-model dependencies. Note: for bugfix PRs, it is good etiquette to inform the original author to seek their feedback.
diff --git a/docs/source/serving/distributed_serving.md b/docs/source/serving/distributed_serving.md
index 6fbc1ea104678..b1703249d7224 100644
--- a/docs/source/serving/distributed_serving.md
+++ b/docs/source/serving/distributed_serving.md
@@ -18,13 +18,13 @@ After adding enough GPUs and nodes to hold the model, you can run vLLM first, wh
 There is one edge case: if the model fits in a single node with multiple GPUs, but the number of GPUs cannot divide the model size evenly, you can use pipeline parallelism, which splits the model along layers and supports uneven splits. In this case, the tensor parallel size should be 1 and the pipeline parallel size should be the number of GPUs.
 ```
 
-## Details for Distributed Inference and Serving
+## Running vLLM on a single node
 
 vLLM supports distributed tensor-parallel and pipeline-parallel inference and serving. Currently, we support [Megatron-LM's tensor parallel algorithm](https://arxiv.org/pdf/1909.08053.pdf). We manage the distributed runtime with either [Ray](https://github.com/ray-project/ray) or python native multiprocessing. Multiprocessing can be used when deploying on a single node, multi-node inferencing currently requires Ray.
 
-Multiprocessing will be used by default when not running in a Ray placement group and if there are sufficient GPUs available on the same node for the configured {code}`tensor_parallel_size`, otherwise Ray will be used. This default can be overridden via the {code}`LLM` class {code}`distributed_executor_backend` argument or {code}`--distributed-executor-backend` API server argument. Set it to {code}`mp` for multiprocessing or {code}`ray` for Ray. It's not required for Ray to be installed for the multiprocessing case.
+Multiprocessing will be used by default when not running in a Ray placement group and if there are sufficient GPUs available on the same node for the configured `tensor_parallel_size`, otherwise Ray will be used. This default can be overridden via the `LLM` class `distributed_executor_backend` argument or `--distributed-executor-backend` API server argument. Set it to `mp` for multiprocessing or `ray` for Ray. It's not required for Ray to be installed for the multiprocessing case.
 
-To run multi-GPU inference with the {code}`LLM` class, set the {code}`tensor_parallel_size` argument to the number of GPUs you want to use. For example, to run inference on 4 GPUs:
+To run multi-GPU inference with the `LLM` class, set the `tensor_parallel_size` argument to the number of GPUs you want to use. For example, to run inference on 4 GPUs:
 
 ```python
 from vllm import LLM
@@ -32,14 +32,14 @@ llm = LLM("facebook/opt-13b", tensor_parallel_size=4)
 output = llm.generate("San Franciso is a")
 ```
 
-To run multi-GPU serving, pass in the {code}`--tensor-parallel-size` argument when starting the server. For example, to run API server on 4 GPUs:
+To run multi-GPU serving, pass in the `--tensor-parallel-size` argument when starting the server. For example, to run API server on 4 GPUs:
 
 ```console
 $ vllm serve facebook/opt-13b \
 $     --tensor-parallel-size 4
 ```
 
-You can also additionally specify {code}`--pipeline-parallel-size` to enable pipeline parallelism. For example, to run API server on 8 GPUs with pipeline parallelism and tensor parallelism:
+You can also additionally specify `--pipeline-parallel-size` to enable pipeline parallelism. For example, to run API server on 8 GPUs with pipeline parallelism and tensor parallelism:
 
 ```console
 $ vllm serve gpt2 \
@@ -47,7 +47,7 @@ $     --tensor-parallel-size 4 \
 $     --pipeline-parallel-size 2
 ```
 
-## Multi-Node Inference and Serving
+## Running vLLM on multiple nodes
 
 If a single node does not have enough GPUs to hold the model, you can run the model using multiple nodes. It is important to make sure the execution environment is the same on all nodes, including the model path, the Python environment. The recommended way is to use docker images to ensure the same environment, and hide the heterogeneity of the host machines via mapping them into the same docker configuration.
 
diff --git a/docs/source/serving/integrations.md b/docs/source/serving/integrations.md
deleted file mode 100644
index d214c77254257..0000000000000
--- a/docs/source/serving/integrations.md
+++ /dev/null
@@ -1,17 +0,0 @@
-# Integrations
-
-```{toctree}
-:maxdepth: 1
-
-run_on_sky
-deploying_with_kserve
-deploying_with_kubeai
-deploying_with_triton
-deploying_with_bentoml
-deploying_with_cerebrium
-deploying_with_lws
-deploying_with_dstack
-serving_with_langchain
-serving_with_llamaindex
-serving_with_llamastack
-```
diff --git a/docs/source/serving/integrations/index.md b/docs/source/serving/integrations/index.md
new file mode 100644
index 0000000000000..371c284981ce9
--- /dev/null
+++ b/docs/source/serving/integrations/index.md
@@ -0,0 +1,8 @@
+# External Integrations
+
+```{toctree}
+:maxdepth: 1
+
+langchain
+llamaindex
+```
diff --git a/docs/source/serving/serving_with_langchain.md b/docs/source/serving/integrations/langchain.md
similarity index 82%
rename from docs/source/serving/serving_with_langchain.md
rename to docs/source/serving/integrations/langchain.md
index 96bd5943f3d64..49ff6e0c32a72 100644
--- a/docs/source/serving/serving_with_langchain.md
+++ b/docs/source/serving/integrations/langchain.md
@@ -1,10 +1,10 @@
-(run-on-langchain)=
+(serving-langchain)=
 
-# Serving with Langchain
+# LangChain
 
-vLLM is also available via [Langchain](https://github.com/langchain-ai/langchain) .
+vLLM is also available via [LangChain](https://github.com/langchain-ai/langchain).
 
-To install langchain, run
+To install LangChain, run
 
 ```console
 $ pip install langchain langchain_community -q
diff --git a/docs/source/serving/serving_with_llamaindex.md b/docs/source/serving/integrations/llamaindex.md
similarity index 74%
rename from docs/source/serving/serving_with_llamaindex.md
rename to docs/source/serving/integrations/llamaindex.md
index 98859d8e3f828..9961c181d7e1c 100644
--- a/docs/source/serving/serving_with_llamaindex.md
+++ b/docs/source/serving/integrations/llamaindex.md
@@ -1,10 +1,10 @@
-(run-on-llamaindex)=
+(serving-llamaindex)=
 
-# Serving with llama_index
+# LlamaIndex
 
-vLLM is also available via [llama_index](https://github.com/run-llama/llama_index) .
+vLLM is also available via [LlamaIndex](https://github.com/run-llama/llama_index).
 
-To install llamaindex, run
+To install LlamaIndex, run
 
 ```console
 $ pip install llama-index-llms-vllm -q
diff --git a/docs/source/serving/metrics.md b/docs/source/serving/metrics.md
index 2dc78643f6d8f..e6ded2e6dd465 100644
--- a/docs/source/serving/metrics.md
+++ b/docs/source/serving/metrics.md
@@ -4,7 +4,7 @@ vLLM exposes a number of metrics that can be used to monitor the health of the
 system. These metrics are exposed via the `/metrics` endpoint on the vLLM
 OpenAI compatible API server.
 
-You can start the server using Python, or using [Docker](deploying_with_docker.md):
+You can start the server using Python, or using [Docker](#deployment-docker):
 
 ```console
 $ vllm serve unsloth/Llama-3.2-1B-Instruct
diff --git a/docs/source/features/multimodal_inputs.md b/docs/source/serving/multimodal_inputs.md
similarity index 95%
rename from docs/source/features/multimodal_inputs.md
rename to docs/source/serving/multimodal_inputs.md
index 4f45a9f448cf0..0efa09f2869ca 100644
--- a/docs/source/features/multimodal_inputs.md
+++ b/docs/source/serving/multimodal_inputs.md
@@ -18,7 +18,7 @@ To input multi-modal data, follow this schema in {class}`vllm.inputs.PromptType`
 
 ### Image
 
-You can pass a single image to the {code}`'image'` field of the multi-modal dictionary, as shown in the following examples:
+You can pass a single image to the `'image'` field of the multi-modal dictionary, as shown in the following examples:
 
 ```python
 llm = LLM(model="llava-hf/llava-1.5-7b-hf")
@@ -122,21 +122,21 @@ for o in outputs:
 
 ### Video
 
-You can pass a list of NumPy arrays directly to the {code}`'video'` field of the multi-modal dictionary
+You can pass a list of NumPy arrays directly to the `'video'` field of the multi-modal dictionary
 instead of using multi-image input.
 
 Full example: <gh-file:examples/offline_inference_vision_language.py>
 
 ### Audio
 
-You can pass a tuple {code}`(array, sampling_rate)` to the {code}`'audio'` field of the multi-modal dictionary.
+You can pass a tuple `(array, sampling_rate)` to the `'audio'` field of the multi-modal dictionary.
 
 Full example: <gh-file:examples/offline_inference_audio_language.py>
 
 ### Embedding
 
 To input pre-computed embeddings belonging to a data type (i.e. image, video, or audio) directly to the language model,
-pass a tensor of shape {code}`(num_items, feature_size, hidden_size of LM)` to the corresponding field of the multi-modal dictionary.
+pass a tensor of shape `(num_items, feature_size, hidden_size of LM)` to the corresponding field of the multi-modal dictionary.
 
 ```python
 # Inference with image embeddings as input
@@ -294,7 +294,7 @@ $ export VLLM_IMAGE_FETCH_TIMEOUT=<timeout>
 
 ### Video
 
-Instead of {code}`image_url`, you can pass a video file via {code}`video_url`. Here is a simple example using [LLaVA-OneVision](https://huggingface.co/llava-hf/llava-onevision-qwen2-0.5b-ov-hf).
+Instead of `image_url`, you can pass a video file via `video_url`. Here is a simple example using [LLaVA-OneVision](https://huggingface.co/llava-hf/llava-onevision-qwen2-0.5b-ov-hf).
 
 First, launch the OpenAI-compatible server:
 
@@ -418,7 +418,7 @@ result = chat_completion_from_base64.choices[0].message.content
 print("Chat completion output from input audio:", result)
 ```
 
-Alternatively, you can pass {code}`audio_url`, which is the audio counterpart of {code}`image_url` for image input:
+Alternatively, you can pass `audio_url`, which is the audio counterpart of `image_url` for image input:
 
 ```python
 chat_completion_from_url = client.chat.completions.create(
diff --git a/docs/source/serving/offline_inference.md b/docs/source/serving/offline_inference.md
new file mode 100644
index 0000000000000..83178f7811825
--- /dev/null
+++ b/docs/source/serving/offline_inference.md
@@ -0,0 +1,79 @@
+(offline-inference)=
+
+# Offline Inference
+
+You can run vLLM in your own code on a list of prompts.
+
+The offline API is based on the {class}`~vllm.LLM` class.
+To initialize the vLLM engine, create a new instance of `LLM` and specify the model to run.
+
+For example, the following code downloads the [`facebook/opt-125m`](https://huggingface.co/facebook/opt-125m) model from HuggingFace
+and runs it in vLLM using the default configuration.
+
+```python
+llm = LLM(model="facebook/opt-125m")
+```
+
+After initializing the `LLM` instance, you can perform model inference using various APIs.
+The available APIs depend on the type of model that is being run:
+
+- [Generative models](#generative-models) output logprobs which are sampled from to obtain the final output text.
+- [Pooling models](#pooling-models) output their hidden states directly.
+
+Please refer to the above pages for more details about each API.
+
+```{seealso}
+[API Reference](/dev/offline_inference/offline_index)
+```
+
+## Configuration Options
+
+This section lists the most common options for running the vLLM engine.
+For a full list, refer to the [Engine Arguments](#engine-args) page.
+
+### Reducing memory usage
+
+Large models might cause your machine to run out of memory (OOM). Here are some options that help alleviate this problem.
+
+#### Tensor Parallelism (TP)
+
+Tensor parallelism (`tensor_parallel_size` option) can be used to split the model across multiple GPUs.
+
+The following code splits the model across 2 GPUs.
+
+```python
+llm = LLM(model="ibm-granite/granite-3.1-8b-instruct",
+          tensor_parallel_size=2)
+```
+
+```{important}
+To ensure that vLLM initializes CUDA correctly, you should avoid calling related functions (e.g. {func}`torch.cuda.set_device`)
+before initializing vLLM. Otherwise, you may run into an error like `RuntimeError: Cannot re-initialize CUDA in forked subprocess`.
+
+To control which devices are used, please instead set the `CUDA_VISIBLE_DEVICES` environment variable.
+```
+
+#### Quantization
+
+Quantized models take less memory at the cost of lower precision.
+
+Statically quantized models can be downloaded from HF Hub (some popular ones are available at [Neural Magic](https://huggingface.co/neuralmagic))
+and used directly without extra configuration.
+
+Dynamic quantization is also supported via the `quantization` option -- see [here](#quantization-index) for more details.
+
+#### Context length and batch size
+
+You can further reduce memory usage by limiting the context length of the model (`max_model_len` option)
+and the maximum batch size (`max_num_seqs` option).
+
+```python
+llm = LLM(model="adept/fuyu-8b",
+          max_model_len=2048,
+          max_num_seqs=2)
+```
+
+### Performance optimization and tuning
+
+You can potentially improve the performance of vLLM by tuning various options.
+Please refer to [this guide](#optimization-and-tuning) for more details.
diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md
index 97e9879075570..1e5ea6357d202 100644
--- a/docs/source/serving/openai_compatible_server.md
+++ b/docs/source/serving/openai_compatible_server.md
@@ -1,8 +1,10 @@
-# OpenAI Compatible Server
+(openai-compatible-server)=
 
-vLLM provides an HTTP server that implements OpenAI's [Completions](https://platform.openai.com/docs/api-reference/completions) and [Chat](https://platform.openai.com/docs/api-reference/chat) API, and more!
+# OpenAI-Compatible Server
 
-You can start the server via the [`vllm serve`](#vllm-serve) command, or through [Docker](deploying_with_docker.md):
+vLLM provides an HTTP server that implements OpenAI's [Completions API](https://platform.openai.com/docs/api-reference/completions), [Chat API](https://platform.openai.com/docs/api-reference/chat), and more!
+
+You can start the server via the [`vllm serve`](#vllm-serve) command, or through [Docker](#deployment-docker):
 ```bash
 vllm serve NousResearch/Meta-Llama-3-8B-Instruct --dtype auto --api-key token-abc123
 ```
diff --git a/docs/source/serving/usage_stats.md b/docs/source/serving/usage_stats.md
index 3d02fbab9216e..cfc3cb2576873 100644
--- a/docs/source/serving/usage_stats.md
+++ b/docs/source/serving/usage_stats.md
@@ -45,7 +45,7 @@ You can preview the collected data by running the following command:
 tail ~/.config/vllm/usage_stats.json
 ```
 
-## Opt-out of Usage Stats Collection
+## Opting out
 
 You can opt-out of usage stats collection by setting the `VLLM_NO_USAGE_STATS` or `DO_NOT_TRACK` environment variable, or by creating a `~/.config/vllm/do_not_track` file:
 

From b278557935d78b337fb5e82a32b02f75678b4101 Mon Sep 17 00:00:00 2001
From: Jee Jee Li <pandaleefree@gmail.com>
Date: Tue, 7 Jan 2025 12:01:39 +0800
Subject: [PATCH 008/115] [Kernel][LoRA]Punica prefill  kernels fusion (#11234)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Signed-off-by: Abatom <abzhonghua@gmail.com>
Co-authored-by: Zhonghua Deng <abatom@163.com>
---
 .buildkite/test-pipeline.yaml          |   3 +-
 tests/lora/test_minicpmv.py            |  77 --------
 tests/lora/test_minicpmv_tp.py         |  63 +++++--
 tests/lora/test_punica_sizes.py        | 166 ++++++++---------
 tests/lora/test_punica_variation.py    | 168 +++++++++--------
 tests/lora/utils.py                    | 144 ++++++++++++---
 vllm/lora/ops/sgmv_expand.py           | 205 +++++++++++++--------
 vllm/lora/ops/sgmv_expand_slice.py     | 241 -------------------------
 vllm/lora/ops/sgmv_shrink.py           | 129 +++++++------
 vllm/lora/ops/utils.py                 | 121 ++++++++++++-
 vllm/lora/punica_wrapper/punica_gpu.py | 154 ++++++----------
 11 files changed, 707 insertions(+), 764 deletions(-)
 delete mode 100644 tests/lora/test_minicpmv.py
 delete mode 100644 vllm/lora/ops/sgmv_expand_slice.py

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 529daf54faecf..dcfe228ce8eae 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -242,7 +242,7 @@ steps:
   source_file_dependencies:
   - vllm/lora
   - tests/lora
-  command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py
+  command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_minicpmv_tp.py
   parallelism: 4
 
 - label: "PyTorch Fullgraph Smoke Test" # 9min
@@ -535,6 +535,7 @@ steps:
     # requires multi-GPU testing for validation.
     - pytest -v -s -x lora/test_chatglm3_tp.py
     - pytest -v -s -x lora/test_llama_tp.py
+    - pytest -v -s -x lora/test_minicpmv_tp.py
 
 
 - label: Weight Loading Multiple GPU Test  # 33min
diff --git a/tests/lora/test_minicpmv.py b/tests/lora/test_minicpmv.py
deleted file mode 100644
index 78bf5a1617233..0000000000000
--- a/tests/lora/test_minicpmv.py
+++ /dev/null
@@ -1,77 +0,0 @@
-from typing import List
-
-import pytest
-
-import vllm
-from vllm.assets.image import ImageAsset
-from vllm.lora.request import LoRARequest
-from vllm.platforms import current_platform
-
-MODEL_PATH = "openbmb/MiniCPM-Llama3-V-2_5"
-
-PROMPT_TEMPLATE = (
-    "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n"
-    "(<image>./</image>)\nWhat is in the image?<|eot_id|>"
-    "<|start_header_id|>assistant<|end_header_id|>\n\n")
-
-IMAGE_ASSETS = [
-    ImageAsset("stop_sign"),
-    ImageAsset("cherry_blossom"),
-]
-
-# After fine-tuning with LoRA, all generated content should start begin `A`.
-EXPECTED_OUTPUT = [
-    "A red and white stop sign with a Chinese archway in the background featuring red lanterns and gold accents.",  # noqa: E501
-    "A pink cherry blossom tree with a blue sky in the background.",
-]
-
-
-def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]:
-    sampling_params = vllm.SamplingParams(
-        temperature=0,
-        max_tokens=5,
-        stop_token_ids=[128001, 128009],  # eos_id, eot_id
-    )
-
-    inputs = [{
-        "prompt": PROMPT_TEMPLATE,
-        "multi_modal_data": {
-            "image": asset.pil_image
-        },
-    } for asset in IMAGE_ASSETS]
-
-    outputs = llm.generate(
-        inputs,
-        sampling_params,
-        lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
-        if lora_id else None,
-    )
-    # Print the outputs.
-    generated_texts: List[str] = []
-    for output in outputs:
-        prompt = output.prompt
-        generated_text = output.outputs[0].text.strip()
-        generated_texts.append(generated_text)
-        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
-    return generated_texts
-
-
-@pytest.mark.xfail(
-    current_platform.is_rocm(),
-    reason="MiniCPM-V dependency xformers incompatible with ROCm")
-def test_minicpmv_lora(minicpmv_lora_files):
-    llm = vllm.LLM(
-        MODEL_PATH,
-        max_num_seqs=2,
-        enable_lora=True,
-        max_loras=4,
-        max_lora_rank=64,
-        trust_remote_code=True,
-        enable_chunked_prefill=True,
-    )
-    output1 = do_sample(llm, minicpmv_lora_files, lora_id=1)
-    for i in range(len(EXPECTED_OUTPUT)):
-        assert EXPECTED_OUTPUT[i].startswith(output1[i])
-    output2 = do_sample(llm, minicpmv_lora_files, lora_id=2)
-    for i in range(len(EXPECTED_OUTPUT)):
-        assert EXPECTED_OUTPUT[i].startswith(output2[i])
diff --git a/tests/lora/test_minicpmv_tp.py b/tests/lora/test_minicpmv_tp.py
index 930f177953a5f..3b0f18325a40b 100644
--- a/tests/lora/test_minicpmv_tp.py
+++ b/tests/lora/test_minicpmv_tp.py
@@ -3,10 +3,10 @@
 import pytest
 
 import vllm
+from tests.utils import fork_new_process_for_each_test
 from vllm.assets.image import ImageAsset
 from vllm.lora.request import LoRARequest
-
-from ..utils import multi_gpu_test
+from vllm.platforms import current_platform
 
 MODEL_PATH = "openbmb/MiniCPM-Llama3-V-2_5"
 
@@ -17,13 +17,11 @@
 
 IMAGE_ASSETS = [
     ImageAsset("stop_sign"),
-    ImageAsset("cherry_blossom"),
 ]
 
 # After fine-tuning with LoRA, all generated content should start begin `A`.
 EXPECTED_OUTPUT = [
     "A red and white stop sign with a Chinese archway in the background featuring red lanterns and gold accents.",  # noqa: E501
-    "A pink cherry blossom tree with a blue sky in the background.",
 ]
 
 
@@ -50,48 +48,75 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]:
     # Print the outputs.
     generated_texts: List[str] = []
     for output in outputs:
-        prompt = output.prompt
         generated_text = output.outputs[0].text.strip()
         generated_texts.append(generated_text)
-        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+        print(f"Generated text: {generated_text!r}")
     return generated_texts
 
 
-@multi_gpu_test(num_gpus=2)
-@pytest.mark.parametrize("fully_sharded", [True, False])
-def test_minicpmv_tp2(minicpmv_lora_files, fully_sharded):
+@pytest.mark.xfail(
+    current_platform.is_rocm(),
+    reason="MiniCPM-V dependency xformers incompatible with ROCm")
+@fork_new_process_for_each_test
+def test_minicpmv_lora(minicpmv_lora_files):
+    llm = vllm.LLM(
+        MODEL_PATH,
+        max_num_seqs=2,
+        enable_lora=True,
+        max_loras=2,
+        max_lora_rank=8,
+        enforce_eager=True,
+        trust_remote_code=True,
+        enable_chunked_prefill=True,
+    )
+    output1 = do_sample(llm, minicpmv_lora_files, lora_id=1)
+    for i in range(len(EXPECTED_OUTPUT)):
+        assert EXPECTED_OUTPUT[i].startswith(output1[i])
+    output2 = do_sample(llm, minicpmv_lora_files, lora_id=2)
+    for i in range(len(EXPECTED_OUTPUT)):
+        assert EXPECTED_OUTPUT[i].startswith(output2[i])
+
+
+@pytest.mark.xfail(
+    current_platform.is_rocm(),
+    reason="MiniCPM-V dependency xformers incompatible with ROCm")
+@fork_new_process_for_each_test
+def test_minicpmv_tp4_wo_fully_sharded_loras(minicpmv_lora_files):
     llm = vllm.LLM(
         MODEL_PATH,
         enable_lora=True,
         max_num_seqs=2,
         max_loras=4,
         max_lora_rank=64,
-        tensor_parallel_size=2,
+        tensor_parallel_size=4,
         trust_remote_code=True,
-        fully_sharded_loras=fully_sharded,
+        enforce_eager=True,
         enable_chunked_prefill=True,
     )
-
     output_tp = do_sample(llm, minicpmv_lora_files, lora_id=1)
-
     for i in range(len(EXPECTED_OUTPUT)):
         assert EXPECTED_OUTPUT[i].startswith(output_tp[i])
 
 
-@multi_gpu_test(num_gpus=4)
-@pytest.mark.parametrize("fully_sharded", [True, False])
-def test_minicpmv_tp4(minicpmv_lora_files, fully_sharded):
+@pytest.mark.xfail(
+    current_platform.is_rocm(),
+    reason="MiniCPM-V dependency xformers incompatible with ROCm")
+@fork_new_process_for_each_test
+def test_minicpmv_tp4_fully_sharded_loras(minicpmv_lora_files):
     llm = vllm.LLM(
         MODEL_PATH,
         enable_lora=True,
         max_num_seqs=2,
-        max_loras=4,
-        max_lora_rank=64,
+        max_loras=2,
+        max_lora_rank=8,
         tensor_parallel_size=4,
         trust_remote_code=True,
-        fully_sharded_loras=fully_sharded,
+        fully_sharded_loras=True,
         enable_chunked_prefill=True,
     )
     output_tp = do_sample(llm, minicpmv_lora_files, lora_id=1)
     for i in range(len(EXPECTED_OUTPUT)):
         assert EXPECTED_OUTPUT[i].startswith(output_tp[i])
+    output_tp = do_sample(llm, minicpmv_lora_files, lora_id=2)
+    for i in range(len(EXPECTED_OUTPUT)):
+        assert EXPECTED_OUTPUT[i].startswith(output_tp[i])
diff --git a/tests/lora/test_punica_sizes.py b/tests/lora/test_punica_sizes.py
index 66b5f82bbb97d..0351fedd1cfa5 100644
--- a/tests/lora/test_punica_sizes.py
+++ b/tests/lora/test_punica_sizes.py
@@ -4,6 +4,8 @@
 whether the corresponding Triton kernel can run normally when tensor parallelism
 is set to [1, 2, 4, 8, 16, 32, 64].
 """
+from threading import Lock
+
 import pytest
 import torch
 
@@ -11,12 +13,13 @@
 from vllm.lora.ops.bgmv_expand_slice import bgmv_expand_slice
 from vllm.lora.ops.bgmv_shrink import bgmv_shrink
 from vllm.lora.ops.sgmv_expand import sgmv_expand
-from vllm.lora.ops.sgmv_expand_slice import sgmv_expand_slice
 from vllm.lora.ops.sgmv_shrink import sgmv_shrink
+from vllm.lora.ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT
 from vllm.platforms import current_platform
 
-from .utils import (generate_data, generate_data_for_expand_nslices,
-                    ref_torch_groupgemm)
+from .utils import (assert_close, generate_data,
+                    generate_data_for_expand_nslices,
+                    generate_data_for_nslices, ref_torch_groupgemm)
 
 HIDDEN_SIZES = [
     128,
@@ -112,14 +115,7 @@
 SEED = [0]
 CUDA_DEVICES = [f"cuda:{0}"]
 
-
-def assert_close(a, b):
-    rtol, atol = {
-        torch.float16: (6e-2, 6e-2),
-        torch.bfloat16: (6e-2, 6e-2),
-        torch.float32: (1e-2, 1e-2),
-    }[a.dtype]
-    torch.testing.assert_close(a, b, rtol=rtol, atol=atol)
+_dict_lock = Lock()
 
 
 @pytest.mark.parametrize("batches", BATCHES)
@@ -127,6 +123,7 @@ def assert_close(a, b):
 @pytest.mark.parametrize("rank", MAX_RANKS)
 @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
 @pytest.mark.parametrize("scaling", SCALES)
+@pytest.mark.parametrize("nslices", [1, 2, 3])
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("op_type", ["shrink", "expand"])
 @pytest.mark.parametrize("seed", SEED)
@@ -137,6 +134,7 @@ def test_punica_sgmv(
     rank: int,
     hidden_size: int,
     scaling: float,
+    nslices: int,
     dtype: torch.dtype,
     op_type: str,
     seed: int,
@@ -148,19 +146,20 @@ def test_punica_sgmv(
     seq_length = 128
     (
         inputs_tensor,
-        lora_weights,
+        lora_weights_lst,
         our_out_tensor,
         ref_out_tensor,
         b_seq_start_loc,
         lora_indices_tensor,
         seq_len_tensor,
         indices,
-    ) = generate_data(
+    ) = generate_data_for_nslices(
         batches,
         hidden_size,
         num_loras,
         rank,
         seq_length,
+        nslices,
         dtype,
         op_type,
         device,
@@ -172,43 +171,64 @@ def test_punica_sgmv(
     else:
         max_seq_length = max_seq_length.item()
     if op_type == "shrink":
-        sgmv_shrink(
-            inputs_tensor,
-            lora_weights,
-            our_out_tensor,
-            b_seq_start_loc,
-            seq_len_tensor,
-            lora_indices_tensor,
-            batches,
-            max_seq_length,
-            token_nums,
-            scaling,
-        )
+        # Clear the pointer cache to prevent erroneous cached pointers.
+        with _dict_lock:
+            _LORA_A_PTR_DICT.clear()
+            sgmv_shrink(
+                inputs_tensor,
+                lora_weights_lst,
+                our_out_tensor,
+                b_seq_start_loc,
+                seq_len_tensor,
+                lora_indices_tensor,
+                batches,
+                max_seq_length,
+                token_nums,
+                scaling,
+            )
+        for index in range(nslices):
+            ref_torch_groupgemm(
+                ref_out_tensor[index],
+                inputs_tensor,
+                lora_weights_lst[index],
+                lora_indices_tensor,
+                seq_len_tensor,
+                batches,
+                scaling,
+                op_type,
+            )
     else:
-        sgmv_expand(
-            inputs_tensor,
-            lora_weights,
-            our_out_tensor,
-            b_seq_start_loc,
-            seq_len_tensor,
-            lora_indices_tensor,
-            batches,
-            max_seq_length,
-            token_nums,
-            add_inputs=True,
-        )
-    ref_torch_groupgemm(
-        ref_out_tensor,
-        inputs_tensor,
-        lora_weights,
-        lora_indices_tensor,
-        seq_len_tensor,
-        batches,
-        scaling if op_type == "shrink" else 1.0,
-        op_type,
-    )
-    if op_type == "shrink":
-        ref_out_tensor = ref_out_tensor.to(torch.float32)
+        with _dict_lock:
+            _LORA_B_PTR_DICT.clear()
+            sgmv_expand(
+                inputs_tensor,
+                lora_weights_lst,
+                our_out_tensor,
+                b_seq_start_loc,
+                seq_len_tensor,
+                lora_indices_tensor,
+                batches,
+                max_seq_length,
+                token_nums,
+                offset_start=0,
+                add_inputs=True,
+            )
+
+        slice_offset = 0
+        for index in range(nslices):
+            lora_weights = lora_weights_lst[index]
+            ref_torch_groupgemm(
+                ref_out_tensor[:, slice_offset:slice_offset + hidden_size],
+                inputs_tensor[index],
+                lora_weights,
+                lora_indices_tensor,
+                seq_len_tensor,
+                batches,
+                1.0,
+                op_type,
+            )
+            slice_offset += hidden_size
+
     assert_close(our_out_tensor, ref_out_tensor)
 
 
@@ -292,25 +312,22 @@ def test_punica_bgmv(
 @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
 @pytest.mark.parametrize("nslices", [2, 3])
 @pytest.mark.parametrize("dtype", DTYPES)
-@pytest.mark.parametrize("op_type", ["sgmv", "bgmv"])
 @pytest.mark.parametrize("seed", SEED)
 @pytest.mark.parametrize("device", CUDA_DEVICES)
-def test_punica_expand_nslices(
+def test_punica_bgmv_expand_nslices(
     batches: int,
     num_loras: int,
     rank: int,
     hidden_size: int,
     nslices: int,
     dtype: torch.dtype,
-    op_type: str,
     seed: int,
     device: str,
 ):
-
     torch.set_default_device(device)
     current_platform.seed_everything(seed)
 
-    seq_length = 128 if op_type == "sgmv" else 1
+    seq_length = 1
     (
         inputs_tensor,
         lora_weights_lst,
@@ -330,41 +347,18 @@ def test_punica_expand_nslices(
         nslices,
         device,
     )
-    max_seq_length = seq_len_tensor.max()
-    token_nums = seq_len_tensor.sum().item()
-    if isinstance(max_seq_length, tuple):
-        max_seq_length = max_seq_length[0].item()
-    else:
-        max_seq_length = max_seq_length.item()
     slice_offset = 0
     for index in range(nslices):
         lora_weights = lora_weights_lst[index]
-        if op_type == "sgmv":
-            sgmv_expand_slice(
-                inputs_tensor,
-                lora_weights,
-                our_outputs,
-                b_seq_start_loc,
-                seq_len_tensor,
-                lora_indices_tensor,
-                batches,
-                max_seq_length,
-                token_nums,
-                slice_offset,
-                hidden_size,
-                add_inputs=True,
-            )
-        else:
-
-            bgmv_expand_slice(
-                inputs_tensor,
-                lora_weights,
-                our_outputs,
-                indices,
-                slice_offset,
-                slice_size=hidden_size,
-                add_inputs=True,
-            )
+        bgmv_expand_slice(
+            inputs_tensor,
+            lora_weights,
+            our_outputs,
+            indices,
+            slice_offset,
+            slice_size=hidden_size,
+            add_inputs=True,
+        )
         ref_torch_groupgemm(
             ref_outputs[:, slice_offset:slice_offset + hidden_size],
             inputs_tensor,
diff --git a/tests/lora/test_punica_variation.py b/tests/lora/test_punica_variation.py
index 3b20033271d26..9ee10e7c23ee6 100644
--- a/tests/lora/test_punica_variation.py
+++ b/tests/lora/test_punica_variation.py
@@ -3,6 +3,8 @@
 under different conditions, including various batches, numbers of LoRA , and
 maximum ranks.
 """
+from threading import Lock
+
 import pytest
 import torch
 
@@ -11,12 +13,13 @@
 import vllm.lora.ops.bgmv_expand_slice
 import vllm.lora.ops.bgmv_shrink
 import vllm.lora.ops.sgmv_expand
-import vllm.lora.ops.sgmv_expand_slice
 import vllm.lora.ops.sgmv_shrink  # noqa: F401
+from vllm.lora.ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT
 from vllm.platforms import current_platform
 
-from .utils import (generate_data, generate_data_for_expand_nslices,
-                    ref_torch_groupgemm)
+from .utils import (assert_close, generate_data,
+                    generate_data_for_expand_nslices,
+                    generate_data_for_nslices, ref_torch_groupgemm)
 
 HIDDEN_SIZES = [4097]
 
@@ -28,31 +31,23 @@
 SEED = [0]
 CUDA_DEVICES = [f"cuda:{0}"]
 
-
-def assert_close(a, b):
-    rtol, atol = {
-        torch.float16: (6e-2, 6e-2),
-        torch.bfloat16: (6e-2, 6e-2),
-        torch.float32: (1e-2, 1e-2),
-    }[a.dtype]
-    torch.testing.assert_close(a, b, rtol=rtol, atol=atol)
-
-
 # Unlike test_punica_sizes.py, we directly utilize custom op for
 # testing, which verifies the correct registration of these ops.
 bgmv_expand = torch.ops.vllm.bgmv_expand
 bgmv_expand_slice = torch.ops.vllm.bgmv_expand_slice
 bgmv_shrink = torch.ops.vllm.bgmv_shrink
 sgmv_expand = torch.ops.vllm.sgmv_expand
-sgmv_expand_slice = torch.ops.vllm.sgmv_expand_slice
 sgmv_shrink = torch.ops.vllm.sgmv_shrink
 
+_dict_lock = Lock()
+
 
 @pytest.mark.parametrize("batches", BATCHES)
 @pytest.mark.parametrize("num_loras", NUM_LORA)
 @pytest.mark.parametrize("rank", MAX_RANKS)
 @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
 @pytest.mark.parametrize("scaling", SCALES)
+@pytest.mark.parametrize("nslices", [1, 2, 3])
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("op_type", ["shrink", "expand"])
 @pytest.mark.parametrize("seed", SEED)
@@ -63,6 +58,7 @@ def test_punica_sgmv(
     rank: int,
     hidden_size: int,
     scaling: float,
+    nslices: int,
     dtype: torch.dtype,
     op_type: str,
     seed: int,
@@ -74,19 +70,20 @@ def test_punica_sgmv(
     seq_length = 128
     (
         inputs_tensor,
-        lora_weights,
+        lora_weights_lst,
         our_out_tensor,
         ref_out_tensor,
         b_seq_start_loc,
         lora_indices_tensor,
         seq_len_tensor,
         indices,
-    ) = generate_data(
+    ) = generate_data_for_nslices(
         batches,
         hidden_size,
         num_loras,
         rank,
         seq_length,
+        nslices,
         dtype,
         op_type,
         device,
@@ -98,43 +95,64 @@ def test_punica_sgmv(
     else:
         max_seq_length = max_seq_length.item()
     if op_type == "shrink":
-        sgmv_shrink(
-            inputs_tensor,
-            lora_weights,
-            our_out_tensor,
-            b_seq_start_loc,
-            seq_len_tensor,
-            lora_indices_tensor,
-            batches,
-            max_seq_length,
-            token_nums,
-            scaling,
-        )
+        # Clear the pointer cache to avoid reusing a stale cached pointer.
+        with _dict_lock:
+            _LORA_A_PTR_DICT.clear()
+            sgmv_shrink(
+                inputs_tensor,
+                lora_weights_lst,
+                our_out_tensor,
+                b_seq_start_loc,
+                seq_len_tensor,
+                lora_indices_tensor,
+                batches,
+                max_seq_length,
+                token_nums,
+                scaling,
+            )
+        for index in range(nslices):
+            ref_torch_groupgemm(
+                ref_out_tensor[index],
+                inputs_tensor,
+                lora_weights_lst[index],
+                lora_indices_tensor,
+                seq_len_tensor,
+                batches,
+                scaling,
+                op_type,
+            )
     else:
-        sgmv_expand(
-            inputs_tensor,
-            lora_weights,
-            our_out_tensor,
-            b_seq_start_loc,
-            seq_len_tensor,
-            lora_indices_tensor,
-            batches,
-            max_seq_length,
-            token_nums,
-            add_inputs=True,
-        )
-    ref_torch_groupgemm(
-        ref_out_tensor,
-        inputs_tensor,
-        lora_weights,
-        lora_indices_tensor,
-        seq_len_tensor,
-        batches,
-        scaling if op_type == "shrink" else 1.0,
-        op_type,
-    )
-    if op_type == "shrink":
-        ref_out_tensor = ref_out_tensor.to(torch.float32)
+        with _dict_lock:
+            _LORA_B_PTR_DICT.clear()
+            sgmv_expand(
+                inputs_tensor,
+                lora_weights_lst,
+                our_out_tensor,
+                b_seq_start_loc,
+                seq_len_tensor,
+                lora_indices_tensor,
+                batches,
+                max_seq_length,
+                token_nums,
+                offset_start=0,
+                add_inputs=True,
+            )
+
+        slice_offset = 0
+        for index in range(nslices):
+            lora_weights = lora_weights_lst[index]
+            ref_torch_groupgemm(
+                ref_out_tensor[:, slice_offset:slice_offset + hidden_size],
+                inputs_tensor[index],
+                lora_weights,
+                lora_indices_tensor,
+                seq_len_tensor,
+                batches,
+                1.0,
+                op_type,
+            )
+            slice_offset += hidden_size
+
     assert_close(our_out_tensor, ref_out_tensor)
 
 
@@ -220,24 +238,22 @@ def test_punica_bgmv(
 @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
 @pytest.mark.parametrize("nslices", [2, 3])
 @pytest.mark.parametrize("dtype", DTYPES)
-@pytest.mark.parametrize("op_type", ["sgmv", "bgmv"])
 @pytest.mark.parametrize("seed", SEED)
 @pytest.mark.parametrize("device", CUDA_DEVICES)
-def test_punica_expand_nslices(
+def test_punica_bgmv_expand_nslices(
     batches: int,
     num_loras: int,
     rank: int,
     hidden_size: int,
     nslices: int,
     dtype: torch.dtype,
-    op_type: str,
     seed: int,
     device: str,
 ):
     torch.set_default_device(device)
     current_platform.seed_everything(seed)
 
-    seq_length = 128 if op_type == "sgmv" else 1
+    seq_length = 1
     (
         inputs_tensor,
         lora_weights_lst,
@@ -257,40 +273,18 @@ def test_punica_expand_nslices(
         nslices,
         device,
     )
-    max_seq_length = seq_len_tensor.max()
-    token_nums = seq_len_tensor.sum().item()
-    if isinstance(max_seq_length, tuple):
-        max_seq_length = max_seq_length[0].item()
-    else:
-        max_seq_length = max_seq_length.item()
     slice_offset = 0
     for index in range(nslices):
         lora_weights = lora_weights_lst[index]
-        if op_type == "sgmv":
-            sgmv_expand_slice(
-                inputs_tensor,
-                lora_weights,
-                our_outputs,
-                b_seq_start_loc,
-                seq_len_tensor,
-                lora_indices_tensor,
-                batches,
-                max_seq_length,
-                token_nums,
-                slice_offset,
-                hidden_size,
-                add_inputs=True,
-            )
-        else:
-            bgmv_expand_slice(
-                inputs_tensor,
-                lora_weights,
-                our_outputs,
-                indices,
-                slice_offset,
-                slice_size=hidden_size,
-                add_inputs=True,
-            )
+        bgmv_expand_slice(
+            inputs_tensor,
+            lora_weights,
+            our_outputs,
+            indices,
+            slice_offset,
+            slice_size=hidden_size,
+            add_inputs=True,
+        )
         ref_torch_groupgemm(
             ref_outputs[:, slice_offset:slice_offset + hidden_size],
             inputs_tensor,
diff --git a/tests/lora/utils.py b/tests/lora/utils.py
index e394c33b3f9ea..b66d18074a7bf 100644
--- a/tests/lora/utils.py
+++ b/tests/lora/utils.py
@@ -18,11 +18,13 @@ def set_module_lora(self, module_name: str, lora: LoRALayerWeights):
     def get_module_lora(self, module_name: str) -> LoRALayerWeights:
         return self._loras[module_name]
 
-    def init_random_lora(self,
-                         module_name: str,
-                         weight: torch.Tensor,
-                         rank: int = 8,
-                         generate_embeddings_tensor: int = 0):
+    def init_random_lora(
+        self,
+        module_name: str,
+        weight: torch.Tensor,
+        rank: int = 8,
+        generate_embeddings_tensor: int = 0,
+    ):
         lora = LoRALayerWeights(
             module_name,
             rank=rank,
@@ -35,21 +37,25 @@ def init_random_lora(self,
                               device=self._device),
         )
         if generate_embeddings_tensor:
-            lora.embeddings_tensor = torch.rand(5,
-                                                generate_embeddings_tensor,
-                                                dtype=weight.dtype,
-                                                device=self._device)
+            lora.embeddings_tensor = torch.rand(
+                5,
+                generate_embeddings_tensor,
+                dtype=weight.dtype,
+                device=self._device,
+            )
         self.set_module_lora(module_name, lora)
 
         return lora
 
-    def init_lora(self,
-                  module_name: str,
-                  input_dim: int,
-                  output_dim: int,
-                  rank=8,
-                  noop=False,
-                  embeddings_tensor=None):
+    def init_lora(
+        self,
+        module_name: str,
+        input_dim: int,
+        output_dim: int,
+        rank=8,
+        noop=False,
+        embeddings_tensor=None,
+    ):
         lora = LoRALayerWeights(
             module_name,
             rank=rank,
@@ -125,8 +131,16 @@ def ref_torch_groupgemm(
     return
 
 
-def generate_data(batches, hidden_size, lora_nums, max_rank, seq_length, dtype,
-                  op_type, device):
+def generate_data(
+    batches,
+    hidden_size,
+    lora_nums,
+    max_rank,
+    seq_length,
+    dtype,
+    op_type,
+    device,
+):
     seq_len_tensor = torch.randint(seq_length, seq_length + 1,
                                    (batches, )).to(device)
     b_seq_start_loc = torch.cumsum(
@@ -187,8 +201,16 @@ def generate_data(batches, hidden_size, lora_nums, max_rank, seq_length, dtype,
     )
 
 
-def generate_data_for_expand_nslices(batches, hidden_size, lora_nums, max_rank,
-                                     seq_length, dtype, nslices, device):
+def generate_data_for_expand_nslices(
+    batches,
+    hidden_size,
+    lora_nums,
+    max_rank,
+    seq_length,
+    dtype,
+    nslices,
+    device,
+):
     seq_len_tensor = torch.randint(seq_length, seq_length + 1,
                                    (batches, )).to(device)
     b_seq_start_loc = torch.cumsum(
@@ -221,7 +243,87 @@ def generate_data_for_expand_nslices(batches, hidden_size, lora_nums, max_rank,
     for b_id in range(batches):
         lora_index = lora_indices_tensor[b_id]
         indices[current_offset:current_offset +
-                seq_len_tensor[b_id]] = lora_index.item()
+                seq_len_tensor[b_id]] = (lora_index.item())
+        current_offset += seq_len_tensor[b_id].item()
+
+    lora_indices_tensor = lora_indices_tensor.to(device)
+    return (
+        inputs_tensor,
+        lora_weights_lst,
+        our_out_tensor,
+        ref_out_tensor,
+        b_seq_start_loc,
+        lora_indices_tensor,
+        seq_len_tensor,
+        indices,
+    )
+
+
+def generate_data_for_nslices(
+    batches,
+    hidden_size,
+    lora_nums,
+    max_rank,
+    seq_length,
+    nslices,
+    dtype,
+    op_type,
+    device,
+):
+    seq_len_tensor = torch.randint(seq_length, seq_length + 1,
+                                   (batches, )).to(device)
+    b_seq_start_loc = torch.cumsum(
+        torch.tensor([0] + seq_len_tensor[:-1].tolist(), dtype=torch.long),
+        dim=0,
+    ).to(device)
+    total_tokens = seq_len_tensor.sum()
+
+    lora_weights_lst = []
+    if op_type == "shrink":
+
+        inputs_tensor = torch.rand((total_tokens, hidden_size),
+                                   dtype=dtype).to(device)
+
+        for _ in range(nslices):
+            if op_type == "shrink":
+                lora_weights_lst.append(
+                    torch.rand(
+                        (lora_nums, max_rank, hidden_size),  # col-major
+                        dtype=dtype,
+                    ).to(device))
+        # NOTE: the shrink kernel uses torch.float32 as its output type.
+        # The shrink op needs atomic_add, so the output is initialized to 0.
+        our_out_tensor = torch.zeros(
+            (nslices, total_tokens, max_rank),
+            dtype=torch.float32,
+        ).to(device)
+    else:
+        inputs_tensor = torch.rand(
+            (nslices, total_tokens, max_rank),
+            dtype=dtype,
+        ).to(device)
+        for _ in range(nslices):
+            lora_weights_lst.append(
+                torch.rand(
+                    (lora_nums, hidden_size, max_rank),  # col-major
+                    dtype=dtype,
+                ).to(device))
+        # The expand op computes y += a @ lora_b, so the output is
+        # initialized randomly.
+        our_out_tensor = torch.rand((total_tokens, hidden_size * nslices),
+                                    dtype=dtype).to(device)
+
+    # Ensure the same input.
+    ref_out_tensor = our_out_tensor.clone()
+    lora_indices_tensor = torch.randint(0,
+                                        lora_nums - 1 if lora_nums > 1 else 1,
+                                        (batches, ))
+    indices = torch.zeros((total_tokens), dtype=torch.long).to(device)
+    current_offset = 0
+    for b_id in range(batches):
+        lora_index = lora_indices_tensor[b_id]
+        indices[current_offset:current_offset +
+                seq_len_tensor[b_id]] = (lora_index.item())
         current_offset += seq_len_tensor[b_id].item()
 
     lora_indices_tensor = lora_indices_tensor.to(device)
diff --git a/vllm/lora/ops/sgmv_expand.py b/vllm/lora/ops/sgmv_expand.py
index 77c5178493c44..8af44b703810b 100644
--- a/vllm/lora/ops/sgmv_expand.py
+++ b/vllm/lora/ops/sgmv_expand.py
@@ -1,66 +1,109 @@
 """
 Based on:
-Chen, L., Ye, Z., Wu, Y., Zhuo, D., Ceze, L., & Krishnamurthy, A. (2023). 
-Punica: Multi-Tenant LoRA Serving. 
+Chen, L., Ye, Z., Wu, Y., Zhuo, D., Ceze, L., & Krishnamurthy, A. (2023).
+Punica: Multi-Tenant LoRA Serving.
 https://arxiv.org/abs/2310.18547
 """
 
+from typing import List
+
 import torch
 import triton
 import triton.language as tl
 
 from vllm.utils import direct_register_custom_op
 
+from .utils import _get_lora_b_ptr
+
 
 @triton.jit
 def _sgmv_expand_kernel(
-    input_ptr,
-    lora_ptr,
-    out_ptr,
-    N,
-    K,
-    b_seq_start_loc,
-    seq_lens,
-    lora_indices,
-    xm_stride,
-    xk_stride,  # 1
-    l0_stride,  # hidden_size*max_rank
-    lora_k_stride,
-    lora_n_stride,
-    cm_stride,
-    cn_stride,
-    BLOCK_M: tl.constexpr,
-    BLOCK_N: tl.constexpr,
-    BLOCK_K: tl.constexpr,
-    EVEN_K: tl.constexpr,
-    ADD_INPUTS: tl.constexpr,
-    CAST_TYPE: tl.constexpr,
-):
+        input_ptr,
+        lora_ptr,
+        out_ptr,
+        N,
+        K,
+        b_seq_start_loc,
+        seq_lens,
+        lora_indices,
+        slice_start_loc,
+        input_d0_stride,
+        input_d1_stride,
+        input_d2_stride,  # 1
+        ls_d0_ptr,
+        ls_d1_ptr,
+        ls_d2_ptr,  # 1
+        output_d0_stride,
+        output_d1_stride,  # 1
+        output_hs_ptr,
+        BLOCK_M: tl.constexpr,
+        BLOCK_N: tl.constexpr,
+        BLOCK_K: tl.constexpr,
+        EVEN_K: tl.constexpr,
+        ADD_INPUTS: tl.constexpr,
+        CAST_TYPE: tl.constexpr,
+        SLICE_NUM: tl.constexpr,
+        SAME_STRIDE: tl.constexpr):
     """
-    The sgmv's expand triton kernel is based on GroupGEMM.
+
+    The sgmv expand triton kernel, based on GroupGEMM. Unlike the removed
+    'sgmv_expand_slice' operator, it takes no 'slice_offset': grid axis 2
+    selects the slice (slice_id) and each slice's output column offset is
+    taken from 'slice_start_loc', so all slices are handled in one launch
+    instead of one call per slice.
     """
     pid = tl.program_id(axis=0)
     cur_batch = tl.program_id(axis=1)
+    slice_id = tl.program_id(axis=2)
     cta_n_num = tl.cdiv(N, BLOCK_N)
+    # When the output dimensions of each slice are the same, curr_N = N;
+    # otherwise curr_N = tl.load(output_hs_ptr + slice_id). The latter
+    # situation exists in GQA's qkv linear.
+    curr_N = N if SAME_STRIDE else tl.load(output_hs_ptr + slice_id)
     pid_m = pid // cta_n_num
     pid_n = pid % cta_n_num
     M = tl.load(seq_lens + cur_batch)
     if pid_m * BLOCK_M > M:
         return
+    if pid_n * BLOCK_N > curr_N:
+        return
     lora_index = tl.load(lora_indices + cur_batch)
     if lora_index == -1:
         return
+
     cur_seq_start = tl.load(b_seq_start_loc + cur_batch)
     offset_m = tl.arange(0, BLOCK_M) + pid_m * BLOCK_M
     offset_n = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N
     offset_k = tl.arange(0, BLOCK_K)
     ram = tl.max_contiguous(tl.multiple_of(offset_m % M, BLOCK_M), BLOCK_M)
-    rbn = tl.max_contiguous(tl.multiple_of(offset_n % N, BLOCK_N), BLOCK_N)
+    rbn = tl.max_contiguous(tl.multiple_of(offset_n % curr_N, BLOCK_N),
+                            BLOCK_N)
+    # ls_d*_ptr can be either an integer or a pointer
+    if SAME_STRIDE:
+        # integer
+        cur_lora_d0_stride = ls_d0_ptr
+        cur_lora_d1_stride = ls_d1_ptr
+        cur_lora_d2_stride = ls_d2_ptr
+    else:
+        # pointer
+        cur_lora_d0_stride = tl.load(ls_d0_ptr + slice_id)
+        cur_lora_d1_stride = tl.load(ls_d1_ptr + slice_id)
+        cur_lora_d2_stride = tl.load(ls_d2_ptr + slice_id)
+    if SLICE_NUM == 1:
+        cur_input_ptr = input_ptr
+        cur_lora_ptr = lora_ptr
 
-    a_ptr = (input_ptr + cur_seq_start * xm_stride + ram[:, None] * xm_stride +
-             offset_k[None, :] * xk_stride, )
-    b_ptr = (lora_ptr + l0_stride * lora_index +
-             offset_k[:, None] * lora_n_stride + rbn[None, :] * lora_k_stride)
+    else:
+        cur_input_ptr = input_ptr + slice_id * input_d0_stride
+        cur_lora_ptr = tl.load(lora_ptr + slice_id).to(
+            tl.pointer_type(out_ptr.dtype.element_ty))
+
+    a_ptr = (cur_input_ptr + cur_seq_start * input_d1_stride +
+             ram[:, None] * input_d1_stride +
+             offset_k[None, :] * input_d2_stride, )
+    b_ptr = (cur_lora_ptr + cur_lora_d0_stride * lora_index +
+             offset_k[:, None] * cur_lora_d2_stride +
+             rbn[None, :] * cur_lora_d1_stride)
     accumulator = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
     for k in range(tl.cdiv(K, BLOCK_K)):
         if EVEN_K:
@@ -74,26 +117,30 @@ def _sgmv_expand_kernel(
                               mask=offset_k[:, None] < K - k * BLOCK_K,
                               other=0)
         if CAST_TYPE:
-            tiled_a = tiled_a.to(lora_ptr.dtype.element_ty)
+            tiled_a = tiled_a.to(cur_lora_ptr.dtype.element_ty)
         accumulator += tl.dot(
             tiled_a,
             tiled_b,
         )
-        a_ptr += BLOCK_K * xk_stride
-        b_ptr += BLOCK_K * lora_n_stride
-    tiled_c = accumulator.to(lora_ptr.dtype.element_ty)
+        a_ptr += BLOCK_K * input_d2_stride
+        b_ptr += BLOCK_K * cur_lora_d2_stride
+
+    tiled_c = accumulator.to(cur_lora_ptr.dtype.element_ty)
+    if SLICE_NUM == 1:
+        cur_slice_start = slice_start_loc
+    else:
+        cur_slice_start = tl.load(slice_start_loc + slice_id)
+
     offset_cm = cur_seq_start + tl.arange(0, BLOCK_M) + pid_m * BLOCK_M
-    offset_cn = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N
-    c_ptr = (out_ptr + offset_cm[:, None] * cm_stride +
-             offset_cn[None, :] * cn_stride)
+    offset_cn = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N + cur_slice_start
+    c_ptr = (out_ptr + offset_cm[:, None] * output_d0_stride +
+             offset_cn[None, :] * output_d1_stride)
     M = tl.load(seq_lens + cur_batch)
     c_mask = (offset_cm[:, None] <
-              (cur_seq_start + M)) & (offset_cn[None, :] < N)
+              (cur_seq_start + M)) & (offset_cn[None, :] <
+                                      (cur_slice_start + curr_N))
     if ADD_INPUTS:
-        # explicitly pass in other=None to tell triton that masked values
-        # can be uninitialized. This is OK because the later tl.store operation
-        # uses the same mask, eliminating the risk of garbage values propagating
-        tiled_out = tl.load(c_ptr, mask=c_mask, other=None)
+        tiled_out = tl.load(c_ptr, mask=c_mask)
         tiled_c += tiled_out
     tl.store(c_ptr, tiled_c, mask=c_mask)
 
@@ -101,7 +148,7 @@ def _sgmv_expand_kernel(
 @torch.inference_mode()
 def _sgmv_expand(
     inputs: torch.Tensor,
-    lora_b_weights: torch.Tensor,
+    lora_b_weights: List[torch.Tensor],
     output_tensor: torch.Tensor,
     b_seq_start_loc: torch.Tensor,
     seq_len_tensor: torch.Tensor,
@@ -109,17 +156,18 @@ def _sgmv_expand(
     batches: int,
     max_seq_length: int,
     token_nums: int,
+    offset_start: int = 0,
     add_inputs: bool = False,
 ) -> None:
     """
     Args:
         inputs (torch.Tensor): input tensor
-        lora_b_weights (torch.Tensor): lora'a weight
+        lora_b_weights (List[torch.Tensor]): lora_b weights, one per slice
         output_tensor (torch.Tensor): output tensor
         b_seq_start_loc (torch.Tensor): (batch_size,). The cumulative
             sequence lengths of the sequences in the batch, used to index
             into sequence. E.g., if the sequence length is [4, 6], it is
-            [0, 4, 10].
+            [0, 4].
         seq_len_tensor (torch.Tensor): (batch_size,). Record the sequence
             length of the sequences in the batch.
         lora_indices_tensor (torch.Tensor): (batch_size,). The LoRA index
@@ -130,77 +178,80 @@ def _sgmv_expand(
             batch.
         token_nums (int): The token numbers in the batch. Used to verify if the 
             token numbers in the inputs matches the one in the metadata.
-        add_inputs (bool, optional): Defaults to False, adds the final lora 
-            results to the output.
+        offset_start (int, optional): Offset start for output_tensor. 
+            Defaults to 0.
+        add_inputs (bool, optional): Whether to accumulate the result into
+            the existing contents of output_tensor. Defaults to False.
     """
-
     assert inputs.dtype in [torch.float16, torch.bfloat16, torch.float32]
-    assert lora_b_weights.dtype in [
-        torch.float16,
-        torch.bfloat16,
-    ]
-    assert inputs.size(0) == token_nums
-    assert inputs.size(1) == lora_b_weights.size(-1)
+    for weight in lora_b_weights:
+        assert weight.dtype in [torch.float16, torch.bfloat16]
+
+    assert inputs.size(1) == token_nums
+    assert inputs.size(0) == len(lora_b_weights)
+
     assert b_seq_start_loc.size(0) == batches
     assert lora_indices_tensor.size(0) == batches
-    assert inputs.is_contiguous()
     assert output_tensor.is_contiguous()
-
-    if lora_b_weights.ndim == 4:  # shape:(lora_num,1,size,rank)
-        assert lora_b_weights.size(1) == 1
-        lora_b_weights = lora_b_weights.squeeze(dim=1)
-    else:
-        assert lora_b_weights.ndim == 3  # shape:(lora_num,size,rank)
-
-    assert lora_b_weights.is_contiguous()
+    (slice_start_tensor, lora_ptr_tensor, lora_strides_d0_tensor,
+     lora_strides_d1_tensor, lora_strides_d2_tensor, hidden_sizes_tensor,
+     same_stride, MAX_N) = _get_lora_b_ptr(lora_b_weights, offset_start,
+                                           b_seq_start_loc.device)
 
     # TODO tuning this config
+    K = lora_b_weights[0].shape[-1]  # K= rank
 
-    N, K = lora_b_weights.shape[-2:]  # K= rank,N=hidden_size
-    BLOCK_M = 32
-    BLOCK_N = 32
+    BLOCK_M = 64
+    BLOCK_N = 128
     BLOCK_K = 16
     EVEN_K = K % BLOCK_K == 0
     ADD_INPUTS = add_inputs
     CAST_TYPE = False
-    if inputs.dtype == torch.float32 and lora_b_weights.dtype in [
+
+    if inputs.dtype == torch.float32 and lora_b_weights[0].dtype in [
             torch.float16,
             torch.bfloat16,
     ]:
         CAST_TYPE = True
     grid = (
-        triton.cdiv(max_seq_length, BLOCK_M) * triton.cdiv(N, BLOCK_N),
+        triton.cdiv(max_seq_length, BLOCK_M) * triton.cdiv(MAX_N, BLOCK_N),
         batches,
+        len(lora_b_weights),
     )
     _sgmv_expand_kernel[grid](
         inputs,
-        lora_b_weights,
+        lora_ptr_tensor,
         output_tensor,
-        N,
+        MAX_N,
         K,
         b_seq_start_loc,
         seq_len_tensor,
         lora_indices_tensor,
+        slice_start_tensor,
         inputs.stride(0),
         inputs.stride(1),
-        lora_b_weights.stride(0),
-        lora_b_weights.stride(1),
-        lora_b_weights.stride(2),
+        inputs.stride(2),
+        lora_strides_d0_tensor,
+        lora_strides_d1_tensor,
+        lora_strides_d2_tensor,
         output_tensor.stride(0),
         output_tensor.stride(1),
+        hidden_sizes_tensor,
         BLOCK_M,
         BLOCK_N,
         BLOCK_K,
         EVEN_K,
         ADD_INPUTS,
         CAST_TYPE,
+        len(lora_b_weights),
+        same_stride,
     )
     return
 
 
-def sgmv_expand_fake(
+def _sgmv_expand_fake(
     inputs: torch.Tensor,
-    lora_b_weights: torch.Tensor,
+    lora_b_weights: List[torch.Tensor],
     output_tensor: torch.Tensor,
     b_seq_start_loc: torch.Tensor,
     seq_len_tensor: torch.Tensor,
@@ -208,18 +259,18 @@ def sgmv_expand_fake(
     batches: int,
     max_seq_length: int,
     token_nums: int,
+    offset_start: int = 0,
     add_inputs: bool = False,
 ) -> None:
     return
 
 
 try:
-
     direct_register_custom_op(
         op_name="sgmv_expand",
         op_func=_sgmv_expand,
         mutates_args=["output_tensor"],
-        fake_impl=sgmv_expand_fake,
+        fake_impl=_sgmv_expand_fake,
     )
     sgmv_expand = torch.ops.vllm.sgmv_expand
 
diff --git a/vllm/lora/ops/sgmv_expand_slice.py b/vllm/lora/ops/sgmv_expand_slice.py
deleted file mode 100644
index 55c4fb68ed128..0000000000000
--- a/vllm/lora/ops/sgmv_expand_slice.py
+++ /dev/null
@@ -1,241 +0,0 @@
-"""
-Based on:
-Chen, L., Ye, Z., Wu, Y., Zhuo, D., Ceze, L., & Krishnamurthy, A. (2023). 
-Punica: Multi-Tenant LoRA Serving. 
-https://arxiv.org/abs/2310.18547
-"""
-
-import torch
-import triton
-import triton.language as tl
-
-from vllm.utils import direct_register_custom_op
-
-
-@triton.jit
-def _sgmv_expand_slice_kernel(
-    input_ptr,
-    lora_ptr,
-    out_ptr,
-    N,
-    K,
-    b_seq_start_loc,
-    seq_lens,
-    lora_indices,
-    xm_stride,
-    xk_stride,  # 1
-    l0_stride,  # hidden_size*max_rank
-    lora_k_stride,
-    lora_n_stride,
-    cm_stride,
-    cn_stride,
-    slice_offset,
-    BLOCK_M: tl.constexpr,
-    BLOCK_N: tl.constexpr,
-    BLOCK_K: tl.constexpr,
-    EVEN_K: tl.constexpr,
-    ADD_INPUTS: tl.constexpr,
-    CAST_TYPE: tl.constexpr,
-):
-    """
-
-    Similar to the 'sgmv_expand' operator, but with an added parameter 
-    'slice_offset'. The reason for not reusing the 'sgmv_expand' operator 
-    might be that in the future, we could implement a fusion operator to 
-    achieve the current functionality instead of having to call it multiple 
-    times.
-    """
-    pid = tl.program_id(axis=0)
-    cur_batch = tl.program_id(axis=1)
-    cta_n_num = tl.cdiv(N, BLOCK_N)
-    pid_m = pid // cta_n_num
-    pid_n = pid % cta_n_num
-    M = tl.load(seq_lens + cur_batch)
-    if pid_m * BLOCK_M > M:
-        return
-    lora_index = tl.load(lora_indices + cur_batch)
-    if lora_index == -1:
-        return
-    cur_seq_start = tl.load(b_seq_start_loc + cur_batch)
-    offset_m = tl.arange(0, BLOCK_M) + pid_m * BLOCK_M
-    offset_n = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N
-    offset_k = tl.arange(0, BLOCK_K)
-    ram = tl.max_contiguous(tl.multiple_of(offset_m % M, BLOCK_M), BLOCK_M)
-    rbn = tl.max_contiguous(tl.multiple_of(offset_n % N, BLOCK_N), BLOCK_N)
-
-    a_ptr = (input_ptr + cur_seq_start * xm_stride + ram[:, None] * xm_stride +
-             offset_k[None, :] * xk_stride, )
-    b_ptr = (lora_ptr + l0_stride * lora_index +
-             offset_k[:, None] * lora_n_stride + rbn[None, :] * lora_k_stride)
-    accumulator = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
-    for k in range(tl.cdiv(K, BLOCK_K)):
-        if EVEN_K:
-            tiled_a = tl.load(a_ptr)
-            tiled_b = tl.load(b_ptr)
-        else:
-            tiled_a = tl.load(a_ptr,
-                              mask=offset_k[None, :] < K - k * BLOCK_K,
-                              other=0)
-            tiled_b = tl.load(b_ptr,
-                              mask=offset_k[:, None] < K - k * BLOCK_K,
-                              other=0)
-        if CAST_TYPE:
-            tiled_a = tiled_a.to(lora_ptr.dtype.element_ty)
-        accumulator += tl.dot(
-            tiled_a,
-            tiled_b,
-        )
-        a_ptr += BLOCK_K * xk_stride
-        b_ptr += BLOCK_K * lora_n_stride
-    tiled_c = accumulator.to(lora_ptr.dtype.element_ty)
-    offset_cm = cur_seq_start + tl.arange(0, BLOCK_M) + pid_m * BLOCK_M
-    offset_cn = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N + slice_offset
-    c_ptr = (out_ptr + offset_cm[:, None] * cm_stride +
-             offset_cn[None, :] * cn_stride)
-    M = tl.load(seq_lens + cur_batch)
-    c_mask = (offset_cm[:, None] < (cur_seq_start + M)) & (offset_cn[None, :] <
-                                                           (slice_offset + N))
-    if ADD_INPUTS:
-        # explicitly pass in other=None to tell triton that masked values
-        # can be uninitialized. This is OK because the later tl.store operation
-        # uses the same mask, eliminating the risk of garbage values propagating
-        tiled_out = tl.load(c_ptr, mask=c_mask, other=None)
-        tiled_c += tiled_out
-    tl.store(c_ptr, tiled_c, mask=c_mask)
-
-
-@torch.inference_mode()
-def _sgmv_expand_slice(
-    inputs: torch.Tensor,
-    lora_b_weights: torch.Tensor,
-    output_tensor: torch.Tensor,
-    b_seq_start_loc: torch.Tensor,
-    seq_len_tensor: torch.Tensor,
-    lora_indices_tensor: torch.Tensor,
-    batches: int,
-    max_seq_length: int,
-    token_nums: int,
-    slice_offset: int,
-    slice_size: int,
-    add_inputs: bool = False,
-) -> None:
-    """_summary_
-
-    Args:
-        inputs (torch.Tensor): input tensor
-        lora_b_weights (torch.Tensor): lora'a weight
-        output_tensor (torch.Tensor): output tensor
-        b_seq_start_loc (torch.Tensor): (batch_size,). The cumulative
-            sequence lengths of the sequences in the batch, used to index
-            into sequence. E.g., if the sequence length is [4, 6], it is
-            [0, 4, 10].
-        seq_len_tensor (torch.Tensor): (batch_size,). Record the sequence
-            length of the sequences in the batch
-        lora_indices_tensor (torch.Tensor): (batch_size,). The LoRA index
-            corresponding to each batch. An index of -1 means no lora should be
-            applied.
-        batches (int): batch size
-        max_seq_length (int): The max sequence lengths of the sequences
-            in the batch
-        token_nums (int): The token numbers in the batch. Used to verify if the 
-            token numbers in the inputs matches the one in the metadata.
-        slice_offset (int): output_tensor's offset
-        slice_size (int): current output_tensor's size
-        add_inputs (bool, optional): Defaults to False, adds the final lora 
-            results to the output.
-    """
-
-    assert inputs.dtype in [torch.float16, torch.bfloat16, torch.float32]
-    assert lora_b_weights.dtype in [
-        torch.float16,
-        torch.bfloat16,
-    ]
-    assert inputs.size(0) == token_nums
-    assert inputs.size(1) == lora_b_weights.size(-1)
-    assert b_seq_start_loc.size(0) == batches
-    assert lora_indices_tensor.size(0) == batches
-    assert slice_size == lora_b_weights.size(-2)
-    assert inputs.is_contiguous()
-    assert output_tensor.is_contiguous()
-
-    if lora_b_weights.ndim == 4:  # shape:(lora_num,1,size,rank)
-        assert lora_b_weights.size(1) == 1
-        lora_b_weights = lora_b_weights.squeeze(dim=1)
-    else:
-        assert lora_b_weights.ndim == 3  # shape:(lora_num,size,rank)
-
-    assert lora_b_weights.is_contiguous()
-
-    # TODO tuning this config
-    N, K = lora_b_weights.shape[-2:]  # K= rank,N=hidden_size
-
-    BLOCK_M = 32
-    BLOCK_N = 32
-    BLOCK_K = 16
-    EVEN_K = K % BLOCK_K == 0
-    ADD_INPUTS = add_inputs
-    CAST_TYPE = False
-    if inputs.dtype == torch.float32 and lora_b_weights.dtype in [
-            torch.float16,
-            torch.bfloat16,
-    ]:
-        CAST_TYPE = True
-    grid = (
-        triton.cdiv(max_seq_length, BLOCK_M) * triton.cdiv(N, BLOCK_N),
-        batches,
-    )
-    _sgmv_expand_slice_kernel[grid](
-        inputs,
-        lora_b_weights,
-        output_tensor,
-        N,
-        K,
-        b_seq_start_loc,
-        seq_len_tensor,
-        lora_indices_tensor,
-        inputs.stride(0),
-        inputs.stride(1),
-        lora_b_weights.stride(0),
-        lora_b_weights.stride(1),
-        lora_b_weights.stride(2),
-        output_tensor.stride(0),
-        output_tensor.stride(1),
-        slice_offset,
-        BLOCK_M,
-        BLOCK_N,
-        BLOCK_K,
-        EVEN_K,
-        ADD_INPUTS,
-        CAST_TYPE,
-    )
-    return
-
-
-def sgmv_expand_slice_fake(
-    inputs: torch.Tensor,
-    lora_b_weights: torch.Tensor,
-    output_tensor: torch.Tensor,
-    b_seq_start_loc: torch.Tensor,
-    seq_len_tensor: torch.Tensor,
-    lora_indices_tensor: torch.Tensor,
-    batches: int,
-    max_seq_length: int,
-    token_nums: int,
-    slice_offset: int,
-    slice_size: int,
-    add_inputs: bool = False,
-) -> None:
-    return
-
-
-try:
-    direct_register_custom_op(
-        op_name="sgmv_expand_slice",
-        op_func=_sgmv_expand_slice,
-        mutates_args=["output_tensor"],
-        fake_impl=sgmv_expand_slice_fake,
-    )
-    sgmv_expand_slice = torch.ops.vllm.sgmv_expand_slice
-
-except AttributeError:
-    sgmv_expand_slice = _sgmv_expand_slice
diff --git a/vllm/lora/ops/sgmv_shrink.py b/vllm/lora/ops/sgmv_shrink.py
index 37d1dc84eebca..3d2ebe8286f56 100644
--- a/vllm/lora/ops/sgmv_shrink.py
+++ b/vllm/lora/ops/sgmv_shrink.py
@@ -5,48 +5,60 @@
 https://arxiv.org/abs/2310.18547
 """
 
+from typing import List
+
 import torch
 import triton
 import triton.language as tl
 
 from vllm.utils import direct_register_custom_op
 
+from .utils import _get_lora_a_ptr
+
 
 @triton.jit
 def _sgmv_shrink_kernel(
-    input_ptr,
-    lora_ptr,
-    out_ptr,
-    N,
-    K,
-    b_seq_start_loc,
-    seq_lens,
-    lora_indices,
-    scaling,
-    xm_stride,  # hidden_size
-    xk_stride,  # 1
-    l0_stride,  # hidden_size*max_rank
-    lora_k_stride,
-    lora_n_stride,
-    cm_stride,
-    cn_stride,
-    BLOCK_M: tl.constexpr,
-    BLOCK_N: tl.constexpr,
-    BLOCK_K: tl.constexpr,
-    EVEN_K: tl.constexpr,
-    SPLIT_K: tl.constexpr,
-):
+        input_ptr,
+        lora_ptr,  #1-3
+        out_ptr,
+        N,
+        K,
+        b_seq_start_loc,
+        seq_lens,
+        lora_indices,
+        scaling,
+        input_d0_stride,
+        input_d1_stride,  # 1
+        lora_d0_stride,
+        lora_d1_stride,
+        lora_d2_stride,  # 1
+        output_d0_stride,
+        output_d1_stride,
+        output_d2_stride,  # 1 
+        BLOCK_M: tl.constexpr,
+        BLOCK_N: tl.constexpr,
+        BLOCK_K: tl.constexpr,
+        EVEN_K: tl.constexpr,
+        SPLIT_K: tl.constexpr,
+        SLICE_NUM: tl.constexpr):
     """
     The sgmv's shrink triton kernel is based on GroupGEMM+SPLIT-K.
     The GEMM of Multi-LoRA can be considered as GroupGEMM. Additionally,
     introducing SPLIT-K can improve performance
     """
     pid = tl.program_id(axis=0)
-    pid_sk = tl.program_id(axis=1)
+    pid_mix = tl.program_id(axis=1)
     cur_batch = tl.program_id(axis=2)
     cta_n_num = tl.cdiv(N, BLOCK_N)
     pid_m = pid // cta_n_num
     pid_n = pid % cta_n_num
+    if SLICE_NUM == 1:
+        slice_id: tl.constexpr = 0
+        pid_sk = tl.program_id(axis=1)
+    else:
+        pid_mix = tl.program_id(axis=1)
+        slice_id = pid_mix // SPLIT_K
+        pid_sk = pid_mix % SPLIT_K
 
     M = tl.load(seq_lens + cur_batch)
     if pid_m * BLOCK_M > M:
@@ -61,11 +73,22 @@ def _sgmv_shrink_kernel(
 
     ram = tl.max_contiguous(tl.multiple_of(offset_m % M, BLOCK_M), BLOCK_M)
     rbn = tl.max_contiguous(tl.multiple_of(offset_n % N, BLOCK_N), BLOCK_N)
+    # input ptr
+    a_ptr = (input_ptr + cur_seq_start * input_d0_stride +
+             ram[:, None] * input_d0_stride +
+             offset_k[None, :] * input_d1_stride)
+
+    if SLICE_NUM == 1:
+        # current lora ptr
+        cur_lora_ptr = lora_ptr
+    else:
+        # current lora ptr
+        cur_lora_ptr = tl.load(lora_ptr + slice_id).to(
+            tl.pointer_type(input_ptr.dtype.element_ty))
 
-    a_ptr = (input_ptr + cur_seq_start * xm_stride + ram[:, None] * xm_stride +
-             offset_k[None, :] * xk_stride)
-    b_ptr = (lora_ptr + l0_stride * lora_index + rbn[None, :] * lora_k_stride +
-             offset_k[:, None] * lora_n_stride)
+    b_ptr = (cur_lora_ptr + lora_d0_stride * lora_index +
+             rbn[None, :] * lora_d1_stride +
+             offset_k[:, None] * lora_d2_stride)
 
     accumulator = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
     for k in range(0, tl.cdiv(K, BLOCK_K * SPLIT_K)):
@@ -82,13 +105,15 @@ def _sgmv_shrink_kernel(
                               other=0.0)
         accumulator += tl.dot(tiled_a, tiled_b)
 
-        a_ptr += BLOCK_K * SPLIT_K * xk_stride
-        b_ptr += BLOCK_K * SPLIT_K * lora_n_stride
+        a_ptr += BLOCK_K * SPLIT_K * input_d1_stride
+        b_ptr += BLOCK_K * SPLIT_K * lora_d2_stride
     offset_cm = cur_seq_start + tl.arange(0, BLOCK_M) + pid_m * BLOCK_M
 
     offset_cn = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N
-    c_ptr = (out_ptr + offset_cm[:, None] * cm_stride +
-             offset_cn[None, :] * cn_stride)
+    cur_out_ptr = (out_ptr if SLICE_NUM == 1 else out_ptr +
+                   slice_id * output_d0_stride)
+    c_ptr = cur_out_ptr + offset_cm[:, None] * output_d1_stride + offset_cn[
+        None, :] * output_d2_stride
     c_mask = (offset_cm[:, None] <
               (cur_seq_start + M)) & (offset_cn[None, :] < N)
     accumulator *= scaling
@@ -102,7 +127,7 @@ def _sgmv_shrink_kernel(
 @torch.inference_mode()
 def _sgmv_shrink(
     inputs: torch.Tensor,
-    lora_a_weights: torch.Tensor,
+    lora_a_weights: List[torch.Tensor],
     output_tensor: torch.Tensor,
     b_seq_start_loc: torch.Tensor,
     seq_len_tensor: torch.Tensor,
@@ -113,10 +138,9 @@ def _sgmv_shrink(
     scaling: float,
 ) -> None:
     """
-
     Args:
         inputs (torch.Tensor): input tensor
-        lora_a_weights (torch.Tensor): lora'a weight
+        lora_a_weights (List[torch.Tensor]): LoRA A weights, one per slice
         output_tensor (torch.Tensor): output tensor
         b_seq_start_loc (torch.Tensor): (batch_size,). The cumulative
             sequence lengths of the sequences in the batch, used to index
@@ -134,27 +158,21 @@ def _sgmv_shrink(
             token numbers in the inputs matches the one in the metadata.
         scaling (float): Scaling factor.
     """
-    assert inputs.dtype == lora_a_weights.dtype
+    assert inputs.dtype == lora_a_weights[0].dtype
     assert inputs.dtype in [torch.float16, torch.bfloat16]
-    assert lora_a_weights.dtype in [
-        torch.float16,
-        torch.bfloat16,
-    ]
+    for weight in lora_a_weights:
+        assert weight.dtype in [torch.float16, torch.bfloat16]
+
     assert inputs.size(0) == token_nums
-    assert inputs.size(1) == lora_a_weights.size(-1)
+    assert inputs.size(1) == lora_a_weights[0].size(-1)
     assert b_seq_start_loc.size(0) == batches
     assert lora_indices_tensor.size(0) == batches
     assert inputs.is_contiguous()
-
-    if lora_a_weights.ndim == 4:  # shape:(lora_num,1,rank, size)
-        assert lora_a_weights.size(1) == 1
-        lora_a_weights = lora_a_weights.squeeze(dim=1)
-    else:
-        assert lora_a_weights.ndim == 3  # shape:(lora_num,rank, size)
-    assert lora_a_weights.is_contiguous()
     assert output_tensor.is_contiguous()
+    (lora_ptr_tensor, lora_strides_d0, lora_strides_d1,
+     lora_strides_d2) = _get_lora_a_ptr(lora_a_weights, b_seq_start_loc.device)
     # TODO tuning this config
-    N, K = lora_a_weights.shape[-2:]  # K=hidden_size,N=rank
+    N, K = lora_a_weights[0].shape[-2:]  # K=hidden_size,N=rank
     BLOCK_M = 32
     BLOCK_N = 16
     BLOCK_K = 32
@@ -162,13 +180,12 @@ def _sgmv_shrink(
     EVEN_K = K % (BLOCK_K * SPLIT_K) == 0
     grid = (
         triton.cdiv(max_seq_length, BLOCK_M) * triton.cdiv(N, BLOCK_N),
-        SPLIT_K,
+        SPLIT_K * len(lora_a_weights),
         batches,
     )
-
     _sgmv_shrink_kernel[grid](
         inputs,
-        lora_a_weights,
+        lora_ptr_tensor,
         output_tensor,
         N,
         K,
@@ -178,23 +195,25 @@ def _sgmv_shrink(
         scaling,
         inputs.stride(0),
         inputs.stride(1),
-        lora_a_weights.stride(0),
-        lora_a_weights.stride(1),
-        lora_a_weights.stride(2),
+        lora_strides_d0,
+        lora_strides_d1,
+        lora_strides_d2,
         output_tensor.stride(0),
         output_tensor.stride(1),
+        output_tensor.stride(2),
         BLOCK_M,
         BLOCK_N,
         BLOCK_K,
         EVEN_K,
         SPLIT_K,
+        len(lora_a_weights),
     )
     return
 
 
 def sgmv_shrink_fake(
     inputs: torch.Tensor,
-    lora_a_weights: torch.Tensor,
+    lora_a_weights: List[torch.Tensor],
     output_tensor: torch.Tensor,
     b_seq_start_loc: torch.Tensor,
     seq_len_tensor: torch.Tensor,
diff --git a/vllm/lora/ops/utils.py b/vllm/lora/ops/utils.py
index 7c3e27313ad97..7df5bc2c225e5 100644
--- a/vllm/lora/ops/utils.py
+++ b/vllm/lora/ops/utils.py
@@ -1,5 +1,7 @@
 import functools
-from typing import Dict
+from typing import Dict, List, Tuple
+
+import torch
 
 
 @functools.lru_cache
@@ -44,3 +46,120 @@ def get_lora_op_configs(op_type: str, batch: int,
     if not config:
         config = _get_default_config(op_type, batch, hidden_size)
     return config
+
+
+_LORA_A_PTR_DICT: Dict[Tuple[int, ...], Tuple[torch.tensor, ...]] = {}
+_LORA_B_PTR_DICT: Dict[Tuple[int, ...], Tuple[torch.tensor, ...]] = {}
+
+
+def _get_lora_a_ptr(lora_a_weights: List[torch.Tensor], device: str):
+    """
+    `_LORA_A_PTR_DICT` collects the required information during `profile_run`.
+    After this, it remains constant and subsequent usage is through a LUT.
+    Refer to: 
+    https://github.com/triton-lang/triton/blob/release/3.1.x/python/tutorials/08-grouped-gemm.py
+    """
+    key = tuple(lora_weight.data_ptr() for lora_weight in lora_a_weights)
+
+    if values := _LORA_A_PTR_DICT.get(key):
+        return values
+
+    lora_strides_d0 = []
+    lora_strides_d1 = []
+    lora_strides_d2 = []
+    tensor_ptrs = []
+    for lora_a_weight in lora_a_weights:
+        if lora_a_weight.ndim == 4:  # shape:(lora_num,1,rank,size)
+            assert lora_a_weight.size(1) == 1
+            lora_a_weight = lora_a_weight.squeeze(dim=1)
+        else:
+            assert lora_a_weight.ndim == 3  # shape:(lora_num,rank,size)
+        assert lora_a_weight.is_contiguous()
+        tensor_ptrs.append(lora_a_weight.data_ptr())
+        lora_strides_d0.append(lora_a_weight.stride(0))
+        lora_strides_d1.append(lora_a_weight.stride(1))
+        lora_strides_d2.append(lora_a_weight.stride(2))
+    if len(lora_a_weights) > 1:
+        lora_ptr_tensor = torch.tensor(tensor_ptrs, device=device)
+    else:
+        lora_ptr_tensor = lora_a_weights[0]
+
+    if (len(set(lora_strides_d0)) > 1 or len(set(lora_strides_d1)) > 1
+            or len(set(lora_strides_d2)) > 1):
+        raise ValueError("All LoRA weights must have the same stride.")
+
+    _LORA_A_PTR_DICT[key] = (
+        lora_ptr_tensor,
+        lora_strides_d0[0],
+        lora_strides_d1[0],
+        lora_strides_d2[0],
+    )
+    return _LORA_A_PTR_DICT.get(key)
+
+
+def _get_lora_b_ptr(lora_weights: List[torch.Tensor], offset_start: int,
+                    device: str):
+    """ 
+    `_LORA_B_PTR_DICT` collects the required information during `profile_run`.
+    After this, it remains constant and subsequent usage is through a LUT.
+    Refer to: 
+    https://github.com/triton-lang/triton/blob/release/3.1.x/python/tutorials/08-grouped-gemm.py
+
+    """
+
+    key = tuple(lora_weight.data_ptr() for lora_weight in lora_weights)
+    if values := _LORA_B_PTR_DICT.get(key):
+        return values
+    slice_offset_lst = []
+    tensor_ptrs = []
+    lora_strides_d0 = []
+    lora_strides_d1 = []
+    lora_strides_d2 = []
+    hidden_sizes = []
+    slice_offset = offset_start
+    for lora_b_weight in lora_weights:
+        if lora_b_weight.ndim == 4:  # shape:(lora_num,1,size,rank)
+            assert lora_b_weight.size(1) == 1
+            lora_b_weight = lora_b_weight.squeeze(dim=1)
+        else:
+            assert lora_b_weight.ndim == 3  # shape:(lora_num,size,rank)
+        assert lora_b_weight.is_contiguous()
+        tensor_ptrs.append(lora_b_weight.data_ptr())
+        lora_strides_d0.append(lora_b_weight.stride(0))
+        lora_strides_d1.append(lora_b_weight.stride(1))
+        lora_strides_d2.append(lora_b_weight.stride(2))
+        slice_offset_lst.append(slice_offset)
+        slice_offset += lora_b_weight.size(1)
+        hidden_sizes.append(lora_b_weight.size(1))
+
+    if len(lora_weights) > 1:
+        # note these are device tensors
+        lora_ptr_tensor = torch.tensor(tensor_ptrs, device=device)
+        slice_start_tensor = torch.tensor(slice_offset_lst, device=device)
+    else:
+        slice_start_tensor = slice_offset_lst[0]
+        lora_ptr_tensor = lora_b_weight[0]
+
+    # If each lora has the same stride, there's no need to use a
+    # tensor for storage.
+    if (len(set(lora_strides_d0)) == 1 and len(set(lora_strides_d1)) == 1 and
+            len(set(lora_strides_d2)) == 1) and len(set(hidden_sizes)) == 1:
+        lora_strides_d0_tensor = lora_strides_d0[0]
+        lora_strides_d1_tensor = lora_strides_d1[0]
+        lora_strides_d2_tensor = lora_strides_d2[0]
+        hidden_sizes_tensor = hidden_sizes[0]
+        same_stride = True
+
+    else:
+        lora_strides_d0_tensor = torch.tensor(lora_strides_d0, device=device)
+        lora_strides_d1_tensor = torch.tensor(lora_strides_d1, device=device)
+        lora_strides_d2_tensor = torch.tensor(lora_strides_d2, device=device)
+        hidden_sizes_tensor = torch.tensor(hidden_sizes, device=device)
+        same_stride = False
+    # MAX_N is the maximum hidden size among all the lora_b weights
+    MAX_N = max(hidden_sizes)
+    _LORA_B_PTR_DICT[key] = (slice_start_tensor, lora_ptr_tensor,
+                             lora_strides_d0_tensor, lora_strides_d1_tensor,
+                             lora_strides_d2_tensor, hidden_sizes_tensor,
+                             same_stride, MAX_N)
+    return _LORA_B_PTR_DICT.get(key)
diff --git a/vllm/lora/punica_wrapper/punica_gpu.py b/vllm/lora/punica_wrapper/punica_gpu.py
index de378df8b3cfa..278f7b5a8e9f4 100644
--- a/vllm/lora/punica_wrapper/punica_gpu.py
+++ b/vllm/lora/punica_wrapper/punica_gpu.py
@@ -5,7 +5,7 @@
 https://arxiv.org/abs/2310.18547
 """
 
-from typing import Callable, Optional, Tuple, Union, final
+from typing import Optional, Tuple, Union, final
 
 import torch
 
@@ -16,7 +16,6 @@
     from vllm.lora.ops.bgmv_expand_slice import bgmv_expand_slice
     from vllm.lora.ops.bgmv_shrink import bgmv_shrink
     from vllm.lora.ops.sgmv_expand import sgmv_expand
-    from vllm.lora.ops.sgmv_expand_slice import sgmv_expand_slice
     from vllm.lora.ops.sgmv_shrink import sgmv_shrink
 
 from .punica_base import PunicaWrapperBase
@@ -35,11 +34,11 @@ def __init__(self, max_num_batched_tokens: int, max_batches: int,
         PunicaWrapperBase.__init__(self, max_num_batched_tokens, max_batches,
                                    device)
 
-    def _shrink_prefill(
+    def _apply_shrink_prefill(
         self,
         y: torch.Tensor,
         x: torch.Tensor,
-        w_t_all: torch.Tensor,
+        w_t_all: Tuple[torch.Tensor, ...],
         scale: float,
     ):
         #No LoRA request, so return directly
@@ -53,7 +52,7 @@ def _shrink_prefill(
             scale,
         )
 
-    def _shrink_decode(
+    def _apply_shrink_decode(
         self,
         y: torch.Tensor,
         x: torch.Tensor,
@@ -62,56 +61,28 @@ def _shrink_decode(
     ):
         bgmv_shrink(x, w_t_all, y, self.token_lora_indices, scale)
 
-    def _expand_prefill(
+    def _apply_expand_prefill(
         self,
         y: torch.Tensor,
         x: torch.Tensor,
         w_t_all: torch.Tensor,
+        offset_start: int,
         add_inputs: bool,
     ):
         #No LoRA request, so return directly
         if self.no_lora:
             return
-        sgmv_expand(
-            x,
-            w_t_all,
-            y,
-            *self.prefill_metadata,
-            add_inputs,
-        )
-
-    def _expand_decode(
-        self,
-        y: torch.Tensor,
-        x: torch.Tensor,
-        w_t_all: torch.Tensor,
-        add_inputs: bool,
-    ):
-        bgmv_expand(x, w_t_all, y, self.token_lora_indices, add_inputs)
 
-    def _expand_slice_prefill(
-        self,
-        y: torch.Tensor,
-        x: torch.Tensor,
-        w_t_all: torch.Tensor,
-        y_offset: Optional[int],
-        y_slice_size: Optional[int],
-        add_inputs: bool,
-    ):
-        #No LoRA request, so return directly
-        if self.no_lora:
-            return
-        sgmv_expand_slice(
+        sgmv_expand(
             x,
             w_t_all,
             y,
             *self.prefill_metadata,
-            y_offset,
-            y_slice_size,
-            add_inputs,
+            offset_start=offset_start,
+            add_inputs=add_inputs,
         )
 
-    def _expand_slice_decode(
+    def _apply_expand_decode(
         self,
         y: torch.Tensor,
         x: torch.Tensor,
@@ -123,43 +94,6 @@ def _expand_slice_decode(
         bgmv_expand_slice(x, w_t_all, y, self.token_lora_indices, y_offset,
                           y_slice_size, add_inputs)
 
-    def _apply_expand(
-        self,
-        y: torch.Tensor,
-        x: torch.Tensor,
-        w_t_all: torch.Tensor,
-        y_offset: Optional[int],
-        y_slice_size: Optional[int],
-        add_inputs: bool = True,
-    ):
-        """
-        Perform the ` y[:,y_offset:y_offset+y_slice_size]+=x@w_t_all` 
-        computation, which is suitable for the
-        GEMM of lora'b.
-        """
-
-        expand_slice_fun: Callable = (self._expand_slice_prefill
-                                      if self.is_prefill else
-                                      self._expand_slice_decode)
-        expand_slice_fun(y, x, w_t_all, y_offset, y_slice_size, add_inputs)
-
-    def _apply_shrink(self, y: torch.Tensor, x: torch.Tensor,
-                      w_t_all: torch.Tensor, scale: float):
-        """
-        Perform the ` y+=x@w_t_all` computation, which is suitable for the
-        GEMM of lora'a.
-        When `is_prefill is` true, it indicates that it is currently the
-        prefill stage, and the `_shrink_prefill` function should be called.
-        Otherwise, it is the decode stage, and the _shrink_decode function
-        should be called.
-        """
-        y_org = y
-        y = y.view(-1, y.shape[-1])
-        shrink_fun: Callable = (self._shrink_prefill
-                                if self.is_prefill else self._shrink_decode)
-        shrink_fun(y, x, w_t_all, scale)
-        y = y.view_as(y_org)
-
     def add_shrink(self, y: Union[Tuple[torch.Tensor, ...], torch.Tensor],
                    x: torch.Tensor, lora_a_stacked: Tuple[torch.Tensor, ...],
                    scale: float, **kwargs):
@@ -182,10 +116,15 @@ def add_shrink(self, y: Union[Tuple[torch.Tensor, ...], torch.Tensor],
         """
 
         x = x.view(-1, x.shape[-1])
-        # TODO fuse these kernels
-        for slice_idx in range(len(lora_a_stacked)):
-            self._apply_shrink(y[slice_idx], x, lora_a_stacked[slice_idx],
-                               scale)
+
+        if self.is_prefill:
+            # NOTE fused kernel
+            self._apply_shrink_prefill(y, x, lora_a_stacked, scale)
+        else:
+            # TODO fuse these kernels
+            for slice_idx in range(len(lora_a_stacked)):
+                self._apply_shrink_decode(y[slice_idx], x,
+                                          lora_a_stacked[slice_idx], scale)
 
     def add_expand(self,
                    y: torch.Tensor,
@@ -217,20 +156,28 @@ def add_expand(self,
         """
         y_org = y
         y = y.view(-1, y.shape[-1])
-        offset_left = offset_start
         if lora_bias_stacked is not None:
             self._apply_bias(self.token_lora_indices, y, output_slices,
                              lora_bias_stacked)
-        for slice_idx in range(len(lora_b_stacked)):
-            self._apply_expand(
-                y,
-                x[slice_idx],
-                lora_b_stacked[slice_idx],
-                offset_left,
-                output_slices[slice_idx],
-                add_inputs=add_inputs,
-            )
-            offset_left += output_slices[slice_idx]
+        if self.is_prefill:
+            # NOTE fused kernel
+            self._apply_expand_prefill(y,
+                                       x,
+                                       lora_b_stacked,
+                                       offset_start,
+                                       add_inputs=True)
+        else:
+            # TODO fuse these kernels
+            for slice_idx in range(len(lora_b_stacked)):
+                self._apply_expand_decode(
+                    y,
+                    x[slice_idx],
+                    lora_b_stacked[slice_idx],
+                    offset_start,
+                    output_slices[slice_idx],
+                    add_inputs=add_inputs,
+                )
+                offset_start += output_slices[slice_idx]
         y = y.view_as(y_org)
 
     def add_lora_embedding(self,
@@ -252,10 +199,18 @@ def add_lora_embedding(self,
             add_inputs (bool): Default to True.
         """
 
-        # Embedding layer only need expand op
-        expand_fun: Callable = (self._expand_prefill
-                                if self.is_prefill else self._expand_decode)
-        expand_fun(y, x, lora_b_stacked, add_inputs)
+        if self.is_prefill:
+            sgmv_expand(
+                x.unsqueeze(dim=0),
+                [lora_b_stacked],
+                y,
+                *self.prefill_metadata,
+                offset_start=0,
+                add_inputs=add_inputs,
+            )
+        else:
+            bgmv_expand(x, lora_b_stacked, y, self.token_lora_indices,
+                        add_inputs)
 
     def add_lora_linear(self,
                         y: torch.Tensor,
@@ -301,10 +256,11 @@ def add_lora_linear(self,
             r = lora_b_stacked[0].size(-1)
             # We set the buffer to be float32 by default ,refer to:
             # https://github.com/triton-lang/triton/issues/1387
-            buffer = tuple(
-                torch.zeros(
-                    (x.size(0), r), dtype=torch.float32, device=x.device)
-                for _ in range(len(output_slices)))
+            buffer = torch.zeros(
+                (len(output_slices), x.size(0), r),
+                dtype=torch.float32,
+                device=x.device,
+            )
         self.add_shrink(buffer, x, lora_a_stacked, scale, **kwargs)
         self.add_expand(y,
                         buffer,

From 0f3f3c86ec44467fa80b60cb9f971f9ede028f76 Mon Sep 17 00:00:00 2001
From: Roger Wang <136131678+ywang96@users.noreply.github.com>
Date: Mon, 6 Jan 2025 20:36:24 -0800
Subject: [PATCH 009/115] [Bugfix] Update attention interface in `Whisper`
 (#11784)

Signed-off-by: Roger Wang <ywang@roblox.com>
---
 vllm/model_executor/models/whisper.py | 24 +++++++++++-------------
 1 file changed, 11 insertions(+), 13 deletions(-)

diff --git a/vllm/model_executor/models/whisper.py b/vllm/model_executor/models/whisper.py
index cb54b4c3ba663..c1f3bb0ca33c2 100644
--- a/vllm/model_executor/models/whisper.py
+++ b/vllm/model_executor/models/whisper.py
@@ -106,6 +106,7 @@ def __init__(
             cache_config=cache_config,
             quant_config=quant_config,
             prefix=f"{prefix}.attn",
+            attn_type=self.attn_type,
         )
 
     def _init_qkv(
@@ -134,12 +135,7 @@ def forward(
         qkv, _ = self.qkv_proj(hidden_states)
         q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
 
-        attn_output = self.attn(q,
-                                k,
-                                v,
-                                kv_cache,
-                                attn_metadata,
-                                attn_type=self.attn_type)
+        attn_output = self.attn(q, k, v, kv_cache, attn_metadata)
 
         output, _ = self.out_proj(attn_output)
 
@@ -164,6 +160,7 @@ def __init__(
             cache_config=cache_config,
             quant_config=quant_config,
             prefix=prefix,
+            attn_type=AttentionType.ENCODER_DECODER,
         )
 
     def _init_qkv(
@@ -207,12 +204,13 @@ def forward(
         else:
             k = v = None
 
-        attn_output = self.attn(q,
-                                k,
-                                v,
-                                kv_cache,
-                                attn_metadata,
-                                attn_type=AttentionType.ENCODER_DECODER)
+        attn_output = self.attn(
+            q,
+            k,
+            v,
+            kv_cache,
+            attn_metadata,
+        )
 
         output, _ = self.out_proj(attn_output)
 
@@ -734,4 +732,4 @@ def load_weights(self, weights: Iterable[Tuple[str,
         loaded_weights = [(name, loaded_weight)
                           for name, loaded_weight in weights]
         mapper = WeightsMapper({".fc1.": ".mlp.fc1.", ".fc2.": ".mlp.fc2."})
-        return loader.load_weights(loaded_weights, mapper=mapper)
\ No newline at end of file
+        return loader.load_weights(loaded_weights, mapper=mapper)

From 898cdf033e31dc28042f7181b1565c78d905196e Mon Sep 17 00:00:00 2001
From: Liangfu Chen <liangfc@amazon.com>
Date: Mon, 6 Jan 2025 21:36:10 -0800
Subject: [PATCH 010/115] [CI] Fix neuron CI and run offline tests (#11779)

Signed-off-by: Liangfu Chen <liangfc@amazon.com>
---
 .buildkite/run-neuron-test.sh        | 53 ++++++++++++++--------------
 Dockerfile.neuron                    |  8 +++--
 examples/offline_inference_neuron.py | 11 ++----
 3 files changed, 35 insertions(+), 37 deletions(-)

diff --git a/.buildkite/run-neuron-test.sh b/.buildkite/run-neuron-test.sh
index 9259391aaed49..aa29c434e7cfb 100644
--- a/.buildkite/run-neuron-test.sh
+++ b/.buildkite/run-neuron-test.sh
@@ -3,6 +3,18 @@
 # This script build the Neuron docker image and run the API server inside the container.
 # It serves a sanity check for compilation and basic model usage.
 set -e
+set -v
+
+image_name="neuron/vllm-ci"
+container_name="neuron_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
+
+HF_CACHE="$(realpath ~)/huggingface"
+mkdir -p "${HF_CACHE}"
+HF_MOUNT="/root/.cache/huggingface"
+
+NEURON_COMPILE_CACHE_URL="$(realpath ~)/neuron_compile_cache"
+mkdir -p "${NEURON_COMPILE_CACHE_URL}"
+NEURON_COMPILE_CACHE_MOUNT="/root/.cache/neuron_compile_cache"
 
 # Try building the docker image
 aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com
@@ -13,41 +25,30 @@ if [ -f /tmp/neuron-docker-build-timestamp ]; then
     last_build=$(cat /tmp/neuron-docker-build-timestamp)
     current_time=$(date +%s)
     if [ $((current_time - last_build)) -gt 86400 ]; then
+        docker image prune -f
         docker system prune -f
+        rm -rf "${HF_CACHE:?}"/*  # host-side cache dir; glob must stay outside the quotes to expand
+        rm -rf "${NEURON_COMPILE_CACHE_URL:?}"/*  # host-side compile cache (the *_MOUNT vars are container paths)
         echo "$current_time" > /tmp/neuron-docker-build-timestamp
     fi
 else
     date "+%s" > /tmp/neuron-docker-build-timestamp
 fi
 
-docker build -t neuron -f Dockerfile.neuron .
+docker build -t "${image_name}" -f Dockerfile.neuron .
 
 # Setup cleanup
-remove_docker_container() { docker rm -f neuron || true; }
+remove_docker_container() {
+    docker image rm -f "${image_name}" || true;
+}
 trap remove_docker_container EXIT
-remove_docker_container
 
 # Run the image
-docker run --device=/dev/neuron0 --device=/dev/neuron1 --network host --name neuron neuron python3 -m vllm.entrypoints.api_server \
-       --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --max-num-seqs 8 --max-model-len 128 --block-size 128 --device neuron --tensor-parallel-size 2 &
-
-# Wait for the server to start
-wait_for_server_to_start() {
-    timeout=300
-    counter=0
-
-    while [ "$(curl -s -o /dev/null -w '%{http_code}' localhost:8000/health)" != "200" ]; do
-        sleep 1
-        counter=$((counter + 1))
-        if [ $counter -ge $timeout ]; then
-            echo "Timeout after $timeout seconds"
-            break
-        fi
-    done
-}
-wait_for_server_to_start
-
-# Test a simple prompt
-curl -X POST -H "Content-Type: application/json" \
-    localhost:8000/generate \
-    -d '{"prompt": "San Francisco is a"}'
+docker run --rm -it --device=/dev/neuron0 --device=/dev/neuron1 --network host \
+       -v "${HF_CACHE}:${HF_MOUNT}" \
+       -e "HF_HOME=${HF_MOUNT}" \
+       -v "${NEURON_COMPILE_CACHE_URL}:${NEURON_COMPILE_CACHE_MOUNT}" \
+       -e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \
+       --name "${container_name}" \
+       ${image_name} \
+       /bin/bash -c "python3 /workspace/vllm/examples/offline_inference_neuron.py"
diff --git a/Dockerfile.neuron b/Dockerfile.neuron
index 269139fe90f0b..e9cb82889decd 100644
--- a/Dockerfile.neuron
+++ b/Dockerfile.neuron
@@ -15,8 +15,8 @@ RUN apt-get update && \
         ffmpeg libsm6 libxext6 libgl1
 
 ### Mount Point ###
-# When launching the container, mount the code directory to /app
-ARG APP_MOUNT=/app
+# When launching the container, mount the code directory to /workspace
+ARG APP_MOUNT=/workspace
 VOLUME [ ${APP_MOUNT} ]
 WORKDIR ${APP_MOUNT}/vllm
 
@@ -25,6 +25,7 @@ RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas
 RUN python3 -m pip install sentencepiece transformers==4.45.2 -U
 RUN python3 -m pip install transformers-neuronx --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
 RUN python3 -m pip install neuronx-cc==2.16.345.0 --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
+RUN python3 -m pip install pytest
 
 COPY . .
 ARG GIT_REPO_CHECK=0
@@ -42,4 +43,7 @@ RUN --mount=type=bind,source=.git,target=.git \
 # install development dependencies (for testing)
 RUN python3 -m pip install -e tests/vllm_test_utils
 
+# overwrite entrypoint to run bash script
+RUN echo "import subprocess; import sys; subprocess.check_call(sys.argv[1:])" > /usr/local/bin/dockerd-entrypoint.py
+
 CMD ["/bin/bash"]
diff --git a/examples/offline_inference_neuron.py b/examples/offline_inference_neuron.py
index 2856be7c864ea..f098c8e5fed1e 100644
--- a/examples/offline_inference_neuron.py
+++ b/examples/offline_inference_neuron.py
@@ -1,12 +1,5 @@
-import os
-
 from vllm import LLM, SamplingParams
 
-# creates XLA hlo graphs for all the context length buckets.
-os.environ['NEURON_CONTEXT_LENGTH_BUCKETS'] = "128,512,1024,2048"
-# creates XLA hlo graphs for all the token gen buckets.
-os.environ['NEURON_TOKEN_GEN_BUCKETS'] = "128,512,1024,2048"
-
 # Sample prompts.
 prompts = [
     "Hello, my name is",
@@ -26,8 +19,8 @@
     # Currently, this is a known limitation in continuous batching support
     # in transformers-neuronx.
     # TODO(liangfu): Support paged-attention in transformers-neuronx.
-    max_model_len=2048,
-    block_size=2048,
+    max_model_len=1024,
+    block_size=1024,
     # The device can be automatically detected when AWS Neuron SDK is installed.
     # The device argument can be either unspecified for automated detection,
     # or explicitly assigned.

From e512f76a898d61b8857b36b138bb9cf93ea04d03 Mon Sep 17 00:00:00 2001
From: XiaobingZhang <xiaobingzhangupc@gmail.com>
Date: Tue, 7 Jan 2025 14:12:48 +0800
Subject: [PATCH 011/115] fix init error for MessageQueue when n_local_reader
 is zero (#11768)

---
 vllm/distributed/device_communicators/shm_broadcast.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vllm/distributed/device_communicators/shm_broadcast.py b/vllm/distributed/device_communicators/shm_broadcast.py
index 9f97b0f01ad8a..4ced991f62f66 100644
--- a/vllm/distributed/device_communicators/shm_broadcast.py
+++ b/vllm/distributed/device_communicators/shm_broadcast.py
@@ -247,7 +247,8 @@ def __init__(
         self.handle = Handle(
             connect_ip=connect_ip,
             local_reader_ranks=local_reader_ranks,
-            buffer_handle=self.buffer.handle(),
+            buffer_handle=self.buffer.handle()
+            if self.buffer is not None else None,
             local_subscribe_port=local_subscribe_port,
             remote_subscribe_port=remote_subscribe_port,
         )

From ce1917fcf211458dfbe6bb86d6a9d2d9bd346e63 Mon Sep 17 00:00:00 2001
From: Russell Bryant <rbryant@redhat.com>
Date: Tue, 7 Jan 2025 01:57:32 -0500
Subject: [PATCH 012/115] [Doc] Create a vulnerability management team (#9925)

Signed-off-by: Russell Bryant <rbryant@redhat.com>
---
 SECURITY.md                                   |  2 +-
 .../contributing/vulnerability_management.md  | 43 +++++++++++++++++++
 docs/source/index.md                          |  1 +
 3 files changed, 45 insertions(+), 1 deletion(-)
 create mode 100644 docs/source/contributing/vulnerability_management.md

diff --git a/SECURITY.md b/SECURITY.md
index ad3f1f16ab560..de0032d26c87b 100644
--- a/SECURITY.md
+++ b/SECURITY.md
@@ -4,7 +4,7 @@
 
 If you believe you have found a security vulnerability in vLLM, we encourage you to let us know right away. We will investigate all legitimate reports and do our best to quickly fix the problem.
 
-Please report security issues privately using [the vulnerability submission form](https://github.com/vllm-project/vllm/security/advisories/new).
+Please report security issues privately using [the vulnerability submission form](https://github.com/vllm-project/vllm/security/advisories/new). Reports will then be triaged by the [vulnerability management team](https://docs.vllm.ai/contributing/vulnerability_management/).
 
 ---
 
diff --git a/docs/source/contributing/vulnerability_management.md b/docs/source/contributing/vulnerability_management.md
new file mode 100644
index 0000000000000..422dc13e6a644
--- /dev/null
+++ b/docs/source/contributing/vulnerability_management.md
@@ -0,0 +1,43 @@
+# Vulnerability Management
+
+## Reporting Vulnerabilities
+
+As mentioned in the [security
+policy](https://github.com/vllm-project/vllm/tree/main/SECURITY.md), security
+vulnerabilities may be reported privately to the project via
+[GitHub](https://github.com/vllm-project/vllm/security/advisories/new).
+
+## Vulnerability Management Team
+
+Once a vulnerability has been reported to the project, the Vulnerability
+Management Team (VMT) is responsible for managing the vulnerability. The VMT is
+responsible for:
+
+- Triaging the vulnerability.
+- Coordinating with reporters and project maintainers on vulnerability analysis
+  and resolution.
+- Drafting security advisories for confirmed vulnerabilities, as appropriate.
+- Working with project maintainers on a coordinated release of the fix and
+  security advisory.
+
+### Security Advisories
+
+Advisories are published via GitHub through the same system used to report
+vulnerabilities. More information on the process can be found in the [GitHub
+documentation](https://docs.github.com/en/code-security/security-advisories/working-with-repository-security-advisories/about-repository-security-advisories).
+
+### Team Members
+
+We prefer to keep all vulnerability-related communication on the security report
+on GitHub. However, if you need to contact the VMT directly for an urgent issue,
+you may contact the following individuals:
+
+- Simon Mo - simon.mo@hey.com
+- Russell Bryant - rbryant@redhat.com
+
+## Slack Discussion
+
+You may use the `#security` channel in the [vLLM Slack](https://slack.vllm.ai)
+to discuss security-related topics. However, please do not disclose any
+vulnerabilities in this channel. If you need to report a vulnerability, please
+use the GitHub security advisory system or contact a VMT member privately.
diff --git a/docs/source/index.md b/docs/source/index.md
index c335155bd6e14..11d3e24a9b60a 100644
--- a/docs/source/index.md
+++ b/docs/source/index.md
@@ -169,6 +169,7 @@ contributing/overview
 contributing/profiling/profiling_index
 contributing/dockerfile/dockerfile
 contributing/model/index
+contributing/vulnerability_management
 ```
 
 # Indices and tables

From 1e4ce295ae70771f8e0eaa50962b3dda29c3f0d6 Mon Sep 17 00:00:00 2001
From: Yuan <yuan.zhou@intel.com>
Date: Tue, 7 Jan 2025 15:28:01 +0800
Subject: [PATCH 013/115] [CI][CPU] adding build number to docker image name
 (#11788)

Signed-off-by: Yuan Zhou <yuan.zhou@intel.com>
---
 .buildkite/run-cpu-test.sh | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh
index 4f1729d46dae2..a4eca078568fd 100644
--- a/.buildkite/run-cpu-test.sh
+++ b/.buildkite/run-cpu-test.sh
@@ -9,31 +9,31 @@ CORE_RANGE=${CORE_RANGE:-48-95}
 NUMA_NODE=${NUMA_NODE:-1}
 
 # Try building the docker image
-numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build -t cpu-test -f Dockerfile.cpu .
-numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-avx2 -f Dockerfile.cpu .
+numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build -t cpu-test-"$BUILDKITE_BUILD_NUMBER" -f Dockerfile.cpu .
+numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 -f Dockerfile.cpu .
 
 # Setup cleanup
-remove_docker_container() { docker rm -f cpu-test-"$NUMA_NODE" cpu-test-avx2-"$NUMA_NODE" || true; }
+remove_docker_container() { docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true; }
 trap remove_docker_container EXIT
 remove_docker_container
 
 # Run the image, setting --shm-size=4g for tensor parallel.
 docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE"  \
- --cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test
+ --cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"
 docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
- --cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-avx2-"$NUMA_NODE" cpu-test-avx2
+ --cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2
 
 function cpu_tests() {
   set -e
   export NUMA_NODE=$2
 
   # offline inference
-  docker exec cpu-test-avx2-"$NUMA_NODE" bash -c "
+  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" bash -c "
     set -e
     python3 examples/offline_inference.py"
 
   # Run basic model test
-  docker exec cpu-test-"$NUMA_NODE" bash -c "
+  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
     set -e
     pip install pytest pytest-asyncio \
       decord einops librosa peft Pillow sentence-transformers soundfile \
@@ -46,26 +46,26 @@ function cpu_tests() {
     pytest -v -s tests/models/decoder_only/vision_language -m cpu_model"
 
   # Run compressed-tensor test
-  docker exec cpu-test-"$NUMA_NODE" bash -c "
+  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
     set -e
     pytest -s -v \
     tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
     tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"
 
   # Run AWQ test
-  docker exec cpu-test-"$NUMA_NODE" bash -c "
+  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
     set -e
     pytest -s -v \
     tests/quantization/test_ipex_quant.py"
 
   # Run chunked-prefill and prefix-cache test
-  docker exec cpu-test-"$NUMA_NODE" bash -c "
+  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
     set -e
     pytest -s -v -k cpu_model \
     tests/basic_correctness/test_chunked_prefill.py"  
 
   # online inference
-  docker exec cpu-test-"$NUMA_NODE" bash -c "
+  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
     set -e
     export VLLM_CPU_KVCACHE_SPACE=10 
     export VLLM_CPU_OMP_THREADS_BIND=$1

From 8082ad7950ad96fdc15e6b5a42e8098dd7087f6f Mon Sep 17 00:00:00 2001
From: Roger Wang <136131678+ywang96@users.noreply.github.com>
Date: Tue, 7 Jan 2025 01:55:39 -0800
Subject: [PATCH 014/115] [V1][Doc] Update V1 support for `LLaVa-NeXT-Video`
 (#11798)

Signed-off-by: Roger Wang <ywang@roblox.com>
---
 docs/source/models/supported_models.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md
index 590bea992d1fc..7777545b8b3c1 100644
--- a/docs/source/models/supported_models.md
+++ b/docs/source/models/supported_models.md
@@ -640,7 +640,7 @@ See [this page](#generative-models) for more information on how to use generativ
   - `llava-hf/LLaVA-NeXT-Video-7B-hf`, etc.
   -
   - ✅︎
-  -
+  - ✅︎
 * - `LlavaOnevisionForConditionalGeneration`
   - LLaVA-Onevision
   - T + I<sup>+</sup> + V<sup>+</sup>

From 8f37be38ebfe0295a4925837c501c87149997a4d Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Tue, 7 Jan 2025 18:25:02 +0800
Subject: [PATCH 015/115] [Bugfix] Comprehensively test and fix LLaVA-NeXT
 feature size calculation (#11800)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 requirements-test.in                          |   1 +
 requirements-test.txt                         |   4 +
 .../processing/test_llava_next.py             | 129 +++++++++++++++---
 .../processing/test_llava_onevision.py        | 129 +++++++++++++++---
 vllm/model_executor/models/llava_next.py      |  37 ++---
 vllm/model_executor/models/llava_onevision.py |  42 +++---
 6 files changed, 253 insertions(+), 89 deletions(-)

diff --git a/requirements-test.in b/requirements-test.in
index fb4179c3d8423..4b4dc376d1fa5 100644
--- a/requirements-test.in
+++ b/requirements-test.in
@@ -13,6 +13,7 @@ einops # required for MPT, qwen-vl and Mamba
 httpx
 librosa # required for audio tests
 peft
+pqdm
 ray[adag]==2.40.0
 sentence-transformers # required for embedding tests
 soundfile # required for audio tests
diff --git a/requirements-test.txt b/requirements-test.txt
index 3771577fe8ed0..f576e42afcbbf 100644
--- a/requirements-test.txt
+++ b/requirements-test.txt
@@ -48,6 +48,8 @@ botocore==1.35.57
     #   awscli
     #   boto3
     #   s3transfer
+bounded-pool-executor==0.0.3
+    # via pqdm
 buildkite-test-collector==0.1.9
     # via -r requirements-test.in
 certifi==2024.8.30
@@ -342,6 +344,8 @@ pooch==1.8.2
     # via librosa
 portalocker==2.10.1
     # via sacrebleu
+pqdm==0.2.0
+    # via -r requirements-test.in
 propcache==0.2.0
     # via yarl
 protobuf==5.28.3
diff --git a/tests/models/decoder_only/vision_language/processing/test_llava_next.py b/tests/models/decoder_only/vision_language/processing/test_llava_next.py
index 37a6d334ee60c..9fa6a8a10a0f9 100644
--- a/tests/models/decoder_only/vision_language/processing/test_llava_next.py
+++ b/tests/models/decoder_only/vision_language/processing/test_llava_next.py
@@ -1,8 +1,13 @@
+import itertools
+from functools import partial
+
 import pytest
 from PIL import Image
+from pqdm.threads import pqdm
 from transformers import AutoTokenizer
 
 from vllm.inputs import InputProcessingContext
+from vllm.multimodal.parse import ImageSize
 
 from ....utils import build_model_context
 
@@ -15,20 +20,69 @@ def processor_for_llava_next():
     return LlavaNextMultiModalProcessor
 
 
+def _validate_image_prompt_replacements_one(
+    processor,
+    num_imgs: int,
+    failed_size_excs: list[tuple[ImageSize, Exception]],
+    image_size: ImageSize,
+) -> None:
+    prompt = "<image>" * num_imgs
+    image = Image.new("RGB", size=image_size)
+    mm_data = {"image": [image] * num_imgs}
+
+    try:
+        # The processor will throw an error if there is a mismatch
+        # in the prompt replacements
+        processed_inputs = processor.apply(prompt, mm_data, {})
+
+        image_placeholders = processed_inputs["mm_placeholders"]["image"]
+        assert len(image_placeholders) == num_imgs
+
+        first_placeholder = image_placeholders[0]
+
+        # NOTE: There is a BOS token
+        assert first_placeholder["offset"] == 1
+        assert first_placeholder["length"] == (
+            len(processed_inputs["prompt_token_ids"]) - 1) // num_imgs
+
+    except Exception as exc:
+        failed_size_excs.append((image_size, exc))
+
+
+def _test_image_prompt_replacements(
+    processor,
+    *,
+    num_imgs: int,
+    image_sizes: list[ImageSize],
+) -> None:
+    """
+    Check that LlavaNextMultiModalProcessor replaces image prompts correctly
+    for every size in `image_sizes`; all failing sizes are reported together.
+    """
+    failed_size_excs = list[tuple[ImageSize, Exception]]()
+
+    validate_one = partial(
+        _validate_image_prompt_replacements_one,
+        processor,
+        num_imgs,
+        failed_size_excs,
+    )
+    pqdm(image_sizes, validate_one, n_jobs=8, desc="Validating image sizes")
+
+    if failed_size_excs:
+        msg = "Found failing image sizes:\n" \
+            + "\n========\n".join(f"[{size}]\n{exc}"
+                                  for size, exc in failed_size_excs)
+        raise AssertionError(msg)
+
+
 @pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
-@pytest.mark.parametrize("image_size", [(1669, 2560), (2560, 1669), (183, 488),
-                                        (488, 183), (198, 176), (176, 198),
-                                        (161, 184), (184, 161)])
 @pytest.mark.parametrize("num_imgs", [1, 2])
-def test_processor_prompt_replacements(
+def test_processor_prompt_replacements_regression(
     processor_for_llava_next,
     model_id: str,
-    image_size: tuple[int, int],
     num_imgs: int,
 ):
-    """
-    Ensure LlavaNextMultiModalProcessor handles prompt replacement properly.
-    """
     ctx = build_model_context(
         model_name=model_id,
         tokenizer_name=model_id,
@@ -37,22 +91,55 @@ def test_processor_prompt_replacements(
     )
     tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
     ctx = InputProcessingContext(ctx.model_config, tokenizer)
+    processor = processor_for_llava_next(ctx)
+
+    image_ratios = [(171, 152), (184, 161), (198, 176), (333, 296), (369, 328),
+                    (488, 183), (2560, 1669)]
+    image_sizes = [
+        size for w, h in image_ratios
+        for size in [ImageSize(w, h), ImageSize(h, w)]
+    ]
+
+    _test_image_prompt_replacements(
+        processor,
+        num_imgs=num_imgs,
+        image_sizes=image_sizes,
+    )
 
-    # Build the image str / prompt based on the number of images we pass
-    prompt = "<image>" * num_imgs
-    mm_data = {"image": [Image.new("RGB", size=image_size)] * num_imgs}
 
-    # The processor will throw an error if there is a mismatch
-    # in the prompt replacements
+@pytest.mark.skip("This test takes around 2 hours to run. "
+                  "Comment this out to run it manually.")
+@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
+@pytest.mark.parametrize("num_imgs", [1])
+def test_processor_prompt_replacements_all(
+    processor_for_llava_next,
+    model_id: str,
+    num_imgs: int,
+):
+    ctx = build_model_context(
+        model_name=model_id,
+        tokenizer_name=model_id,
+        mm_processor_kwargs=None,
+        limit_mm_per_prompt={"image": num_imgs},
+    )
+    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
+    ctx = InputProcessingContext(ctx.model_config, tokenizer)
     processor = processor_for_llava_next(ctx)
-    processed_inputs = processor.apply(prompt, mm_data, {})
 
-    image_placeholders = processed_inputs["mm_placeholders"]["image"]
-    assert len(image_placeholders) == num_imgs
+    seen_aspect_ratios = set[float]()
+    image_sizes = list[ImageSize]()
 
-    first_placeholder = image_placeholders[0]
+    # The aspect ratio of the grid layout is between 1 and 2
+    # NOTE: Assumes that feature size calculation is the same if we
+    # swap the width and height of the image
+    for w, h in itertools.product(range(64, 1024), repeat=2):
+        aspect_ratio = w / h
+        if 1 <= aspect_ratio <= 2 and aspect_ratio not in seen_aspect_ratios:
+            image_sizes.append(ImageSize(w, h))
+            seen_aspect_ratios.add(aspect_ratio)
 
-    # NOTE: There is a BOS token
-    assert first_placeholder["offset"] == 1
-    assert first_placeholder["length"] == (
-        len(processed_inputs["prompt_token_ids"]) - 1) // num_imgs
+    _test_image_prompt_replacements(
+        processor,
+        num_imgs=num_imgs,
+        image_sizes=image_sizes,
+    )
diff --git a/tests/models/decoder_only/vision_language/processing/test_llava_onevision.py b/tests/models/decoder_only/vision_language/processing/test_llava_onevision.py
index ed3e2db799be7..d4cdffa210b6d 100644
--- a/tests/models/decoder_only/vision_language/processing/test_llava_onevision.py
+++ b/tests/models/decoder_only/vision_language/processing/test_llava_onevision.py
@@ -1,8 +1,13 @@
+import itertools
+from functools import partial
+
 import pytest
 from PIL import Image
+from pqdm.threads import pqdm
 from transformers import AutoTokenizer
 
 from vllm.inputs import InputProcessingContext
+from vllm.multimodal.parse import ImageSize
 
 from ....utils import build_model_context
 
@@ -15,22 +20,68 @@ def processor_for_llava_onevision():
     return LlavaOnevisionMultiModalProcessor
 
 
+def _validate_image_prompt_replacements_one(
+    processor,
+    num_imgs: int,
+    failed_size_excs: list[tuple[ImageSize, Exception]],
+    image_size: ImageSize,
+) -> None:
+    prompt = "<image>" * num_imgs
+    image = Image.new("RGB", size=image_size)
+    mm_data = {"image": [image] * num_imgs}
+
+    try:
+        # The processor will throw an error if there is a mismatch
+        # in the prompt replacements
+        processed_inputs = processor.apply(prompt, mm_data, {})
+
+        image_placeholders = processed_inputs["mm_placeholders"]["image"]
+        assert len(image_placeholders) == num_imgs
+
+        first_placeholder = image_placeholders[0]
+
+        assert first_placeholder["offset"] == 0
+        assert first_placeholder["length"] == len(
+            processed_inputs["prompt_token_ids"]) // num_imgs
+    except Exception as exc:
+        failed_size_excs.append((image_size, exc))
+
+
+def _test_image_prompt_replacements(
+    processor,
+    *,
+    num_imgs: int,
+    image_sizes: list[ImageSize],
+) -> None:
+    """
+    Check that LlavaOnevisionMultiModalProcessor replaces image prompts
+    for every size in `image_sizes`; all failing sizes are reported together.
+    """
+    failed_size_excs = list[tuple[ImageSize, Exception]]()
+
+    validate_one = partial(
+        _validate_image_prompt_replacements_one,
+        processor,
+        num_imgs,
+        failed_size_excs,
+    )
+    pqdm(image_sizes, validate_one, n_jobs=8, desc="Validating image sizes")
+
+    if failed_size_excs:
+        msg = "Found failing image sizes:\n" \
+            + "\n========\n".join(f"[{size}]\n{exc}"
+                                  for size, exc in failed_size_excs)
+        raise AssertionError(msg)
+
+
 @pytest.mark.parametrize("model_id",
                          ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
-@pytest.mark.parametrize("image_size", [(1669, 2560), (2560, 1669), (183, 488),
-                                        (488, 183), (198, 176), (176, 198),
-                                        (161, 184), (184, 161)])
 @pytest.mark.parametrize("num_imgs", [1, 2])
-def test_processor_prompt_replacements(
+def test_processor_prompt_replacements_regression(
     processor_for_llava_onevision,
     model_id: str,
-    image_size: tuple[int, int],
     num_imgs: int,
 ):
-    """
-    Ensure LlavaOnevisionMultiModalProcessor handles prompt replacement
-    properly.
-    """
     ctx = build_model_context(
         model_name=model_id,
         tokenizer_name=model_id,
@@ -39,22 +90,56 @@ def test_processor_prompt_replacements(
     )
     tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
     ctx = InputProcessingContext(ctx.model_config, tokenizer)
+    processor = processor_for_llava_onevision(ctx)
 
-    # Build the image str / prompt based on the number of images we pass
-    prompt = "<image>" * num_imgs
-    mm_data = {"image": [Image.new("RGB", size=image_size)] * num_imgs}
+    image_ratios = [(171, 152), (184, 161), (198, 176), (333, 296), (369, 328),
+                    (488, 183), (2560, 1669)]
+    image_sizes = [
+        size for w, h in image_ratios
+        for size in [ImageSize(w, h), ImageSize(h, w)]
+    ]
+
+    _test_image_prompt_replacements(
+        processor,
+        num_imgs=num_imgs,
+        image_sizes=image_sizes,
+    )
 
-    # The processor will throw an error if there is a mismatch
-    # in the prompt replacements
+
+@pytest.mark.skip("This test takes around 2 hours to run. "
+                  "Comment this out to run it manually.")
+@pytest.mark.parametrize("model_id",
+                         ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
+@pytest.mark.parametrize("num_imgs", [1])
+def test_processor_prompt_replacements_all(
+    processor_for_llava_onevision,
+    model_id: str,
+    num_imgs: int,
+):
+    ctx = build_model_context(
+        model_name=model_id,
+        tokenizer_name=model_id,
+        mm_processor_kwargs=None,
+        limit_mm_per_prompt={"image": num_imgs},
+    )
+    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
+    ctx = InputProcessingContext(ctx.model_config, tokenizer)
     processor = processor_for_llava_onevision(ctx)
-    processed_inputs = processor.apply(prompt, mm_data, {})
 
-    image_placeholders = processed_inputs["mm_placeholders"]["image"]
-    assert len(image_placeholders) == num_imgs
+    seen_aspect_ratios = set[float]()
+    image_sizes = list[ImageSize]()
 
-    first_placeholder = image_placeholders[0]
+    # The aspect ratio of the grid layout is between 1 and 6
+    # NOTE: Assumes that feature size calculation is the same if we
+    # swap the width and height of the image
+    for w, h in itertools.product(range(64, 1024), repeat=2):
+        aspect_ratio = w / h
+        if 1 <= aspect_ratio <= 6 and aspect_ratio not in seen_aspect_ratios:
+            image_sizes.append(ImageSize(w, h))
+            seen_aspect_ratios.add(aspect_ratio)
 
-    # NOTE: There is a BOS token
-    assert first_placeholder["offset"] == 0
-    assert first_placeholder["length"] == len(
-        processed_inputs["prompt_token_ids"]) // num_imgs
+    _test_image_prompt_replacements(
+        processor,
+        num_imgs=num_imgs,
+        image_sizes=image_sizes,
+    )
diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py
index 258352416d4a7..815456dac2a2f 100644
--- a/vllm/model_executor/models/llava_next.py
+++ b/vllm/model_executor/models/llava_next.py
@@ -2,7 +2,6 @@
 from typing import (Final, Iterable, List, Literal, Mapping, Optional,
                     Protocol, Set, Tuple, TypedDict, Union)
 
-import numpy as np
 import torch
 import torch.nn as nn
 from transformers import BatchFeature, LlavaNextConfig, LlavaNextProcessor
@@ -74,7 +73,7 @@ def _get_hf_config(self) -> LlavaNextLikeConfig:
     def _get_hf_processor(self):
         return self.ctx.get_hf_processor(LlavaNextProcessor)
 
-    # Based on: https://github.com/huggingface/text-generation-inference/blob/v2.2.0/server/text_generation_server/models/vlm_causal_lm.py#L106
+    # Based on: https://github.com/huggingface/text-generation-inference/blob/v3.0.1/server/text_generation_server/models/vlm_causal_lm.py#L113
     def _get_num_image_tokens(
         self,
         *,
@@ -111,7 +110,7 @@ def _get_num_image_tokens(
 
         return unpadded_feature_size + newline_feature_size + base_feature_size
 
-    # Based on: https://github.com/huggingface/text-generation-inference/blob/v2.2.0/server/text_generation_server/models/vlm_causal_lm.py#L79
+    # Based on: https://github.com/huggingface/text-generation-inference/blob/v3.0.1/server/text_generation_server/models/vlm_causal_lm.py#L86
     def _get_num_unpadded_features(
         self,
         *,
@@ -121,29 +120,23 @@ def _get_num_unpadded_features(
         num_patch_height: int,
         num_patch_width: int,
     ) -> tuple[int, int]:
-        # NOTE: Use float32 to remain consistent with HF output
-        current_height_f = np.float32(npatches * num_patch_height)
-        current_width_f = np.float32(npatches * num_patch_width)
+        current_height = npatches * num_patch_height
+        current_width = npatches * num_patch_width
 
-        original_width_f = np.float32(original_width)
-        original_height_f = np.float32(original_height)
+        aspect_ratio = original_width / original_height
+        current_aspect_ratio = current_width / current_height
 
-        original_aspect_ratio = original_width_f / original_height_f
-        current_aspect_ratio = current_width_f / current_height_f
-
-        if original_aspect_ratio > current_aspect_ratio:
-            scale_factor = current_width_f / original_width_f
-            new_height = int(original_height_f * scale_factor)
-            padding = (current_height_f - new_height) // 2
-            current_height_f -= 2 * padding
+        if aspect_ratio > current_aspect_ratio:
+            new_height = (original_height * current_width) // original_width
+            padding = (current_height - new_height) // 2
+            current_height = current_height - (2 * padding)
         else:
-            scale_factor = current_height_f / original_height_f
-            new_width = int(original_width_f * scale_factor)
-            padding = (current_width_f - new_width) // 2
-            current_width_f -= 2 * padding
+            new_width = (original_width * current_height) // original_height
+            padding = (current_width - new_width) // 2
+            current_width = current_width - (2 * padding)
 
-        unpadded_features = int(current_height_f * current_width_f)
-        newline_features = int(current_height_f)
+        unpadded_features = current_height * current_width
+        newline_features = current_height
 
         return (unpadded_features, newline_features)
 
diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py
index 62dae74e377be..b5e3edba1f01c 100644
--- a/vllm/model_executor/models/llava_onevision.py
+++ b/vllm/model_executor/models/llava_onevision.py
@@ -3,7 +3,6 @@
 from typing import (Final, Iterable, List, Literal, Mapping, Optional,
                     Protocol, Set, Tuple, TypedDict, Union)
 
-import numpy as np
 import torch
 import torch.nn as nn
 from transformers import (BatchFeature, LlavaOnevisionConfig,
@@ -98,6 +97,8 @@ def _get_hf_config(self) -> LlavaOnevisionLikeConfig:
     def _get_hf_processor(self):
         return self.ctx.get_hf_processor(LlavaOnevisionProcessor)
 
+    # Based on: https://github.com/huggingface/text-generation-inference/blob/v3.0.1/server/text_generation_server/models/vlm_causal_lm.py#L86
+    # with additional logic afterwards taken from LlavaOnevisionProcessor
     def _get_num_unpadded_features(
         self,
         *,
@@ -107,35 +108,28 @@ def _get_num_unpadded_features(
         num_patch_height: int,
         num_patch_width: int,
     ) -> tuple[int, int]:
-        # NOTE: Use float32 to remain consistent with HF output
-        current_height_f = np.float32(npatches * num_patch_height)
-        current_width_f = np.float32(npatches * num_patch_width)
+        current_height = npatches * num_patch_height
+        current_width = npatches * num_patch_width
 
-        original_width_f = np.float32(original_width)
-        original_height_f = np.float32(original_height)
+        aspect_ratio = original_width / original_height
+        current_aspect_ratio = current_width / current_height
 
-        original_aspect_ratio = original_width_f / original_height_f
-        current_aspect_ratio = current_width_f / current_height_f
-
-        if original_aspect_ratio > current_aspect_ratio:
-            scale_factor = current_width_f / original_width_f
-            new_height = int(original_height_f * scale_factor)
-            padding = (current_height_f - new_height) // 2
-            current_height_f -= 2 * padding
+        if aspect_ratio > current_aspect_ratio:
+            new_height = (original_height * current_width) // original_width
+            padding = (current_height - new_height) // 2
+            current_height = current_height - (2 * padding)
         else:
-            scale_factor = current_height_f / original_height_f
-            new_width = int(original_width_f * scale_factor)
-            padding = (current_width_f - new_width) // 2
-            current_width_f -= 2 * padding
+            new_width = (original_width * current_height) // original_height
+            padding = (current_width - new_width) // 2
+            current_width = current_width - (2 * padding)
 
-        unpadded_features = int(current_height_f * current_width_f)
-        newline_features = int(current_height_f)
+        unpadded_features = current_height * current_width
+        newline_features = current_height
 
-        ratio = math.sqrt(current_height_f * current_width_f /
-                          (9 * npatches**2))
+        ratio = math.sqrt(current_height * current_width / (9 * npatches**2))
         if ratio > 1.1:
-            height_factor = int(current_height_f // ratio)
-            width_factor = int(current_width_f // ratio)
+            height_factor = int(current_height // ratio)
+            width_factor = int(current_width // ratio)
             unpadded_features = height_factor * width_factor
             newline_features = height_factor
 

From 869e829b853cc35747c5e4bc9d773a4cff704d12 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Tue, 7 Jan 2025 18:41:17 +0800
Subject: [PATCH 016/115] [doc] add doc to explain how to use uv (#11773)

Signed-off-by: youkaichao <youkaichao@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
---
 .../getting_started/installation/gpu-cuda.md  | 67 ++++++++++++++-----
 1 file changed, 52 insertions(+), 15 deletions(-)

diff --git a/docs/source/getting_started/installation/gpu-cuda.md b/docs/source/getting_started/installation/gpu-cuda.md
index 7ea10bb8b59ff..295555b6c41f0 100644
--- a/docs/source/getting_started/installation/gpu-cuda.md
+++ b/docs/source/getting_started/installation/gpu-cuda.md
@@ -12,24 +12,43 @@ vLLM is a Python library that also contains pre-compiled C++ and CUDA (12.1) bin
 
 ## Install released versions
 
-You can install vLLM using pip:
+### Create a new Python environment
+
+You can create a new Python environment using `conda`:
 
 ```console
 $ # (Recommended) Create a new conda environment.
 $ conda create -n myenv python=3.12 -y
 $ conda activate myenv
-
-$ # Install vLLM with CUDA 12.1.
-$ pip install vllm
 ```
 
 ```{note}
-Although we recommend using `conda` to create and manage Python environments, it is highly recommended to use `pip` to install vLLM. This is because `pip` can install `torch` with separate library packages like `NCCL`, while `conda` installs `torch` with statically linked `NCCL`. This can cause issues when vLLM tries to use `NCCL`. See <gh-issue:8420> for more details.
+[PyTorch has deprecated the conda release channel](https://github.com/pytorch/pytorch/issues/138506). If you use `conda`, please only use it to create the Python environment rather than to install packages. In particular, PyTorch installed via `conda` statically links the `NCCL` library, which can cause issues when vLLM tries to use `NCCL`. See <gh-issue:8420> for more details.
+```
+
+Or you can create a new Python environment using [uv](https://docs.astral.sh/uv/), a very fast Python environment manager. Please follow the [documentation](https://docs.astral.sh/uv/#getting-started) to install `uv`. After installing `uv`, you can create a new Python environment using the following command:
+
+```console
+$ # (Recommended) Create a new uv environment. Use `--seed` to install `pip` and `setuptools` in the environment.
+$ uv venv myenv --python 3.12 --seed
+$ source myenv/bin/activate
+```
+
+In order to be performant, vLLM has to compile many cuda kernels. The compilation unfortunately introduces binary incompatibility with other CUDA versions and PyTorch versions, even for the same PyTorch version with different building configurations.
+
+Therefore, it is recommended to install vLLM with a **fresh new** environment. If either you have a different CUDA version or you want to use an existing PyTorch installation, you need to build vLLM from source. See [below](#build-from-source) for more details.
+
+### Install vLLM
+
+You can install vLLM using either `pip` or `uv pip`:
+
+```console
+$ # Install vLLM with CUDA 12.1.
+$ pip install vllm # If you are using pip.
+$ uv pip install vllm # If you are using uv.
 ```
 
-````{note}
-As of now, vLLM's binaries are compiled with CUDA 12.1 and public PyTorch release versions by default.
-We also provide vLLM binaries compiled with CUDA 11.8 and public PyTorch release versions:
+As of now, vLLM's binaries are compiled with CUDA 12.1 and public PyTorch release versions by default. We also provide vLLM binaries compiled with CUDA 11.8 and public PyTorch release versions:
 
 ```console
 $ # Install vLLM with CUDA 11.8.
@@ -38,22 +57,19 @@ $ export PYTHON_VERSION=310
 $ pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux1_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118
 ```
 
-In order to be performant, vLLM has to compile many cuda kernels. The compilation unfortunately introduces binary incompatibility with other CUDA versions and PyTorch versions, even for the same PyTorch version with different building configurations.
-
-Therefore, it is recommended to install vLLM with a **fresh new** conda environment. If either you have a different CUDA version or you want to use an existing PyTorch installation, you need to build vLLM from source. See below for instructions.
-````
-
 (install-the-latest-code)=
 
 ## Install the latest code
 
-LLM inference is a fast-evolving field, and the latest code may contain bug fixes, performance improvements, and new features that are not released yet. To allow users to try the latest code without waiting for the next release, vLLM provides wheels for Linux running on a x86 platform with CUDA 12 for every commit since `v0.5.3`. You can download and install it with the following command:
+LLM inference is a fast-evolving field, and the latest code may contain bug fixes, performance improvements, and new features that are not released yet. To allow users to try the latest code without waiting for the next release, vLLM provides wheels for Linux running on a x86 platform with CUDA 12 for every commit since `v0.5.3`.
+
+### Install the latest code using `pip`
 
 ```console
 $ pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
 ```
 
-If you want to access the wheels for previous commits, you can specify the commit hash in the URL:
+If you want to access the wheels for previous commits (e.g. to bisect the behavior change, performance regression), you can specify the commit hash in the URL:
 
 ```console
 $ export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch
@@ -62,6 +78,27 @@ $ pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/${VLLM_COMMIT}/vllm
 
 Note that the wheels are built with Python 3.8 ABI (see [PEP 425](https://peps.python.org/pep-0425/) for more details about ABI), so **they are compatible with Python 3.8 and later**. The version string in the wheel file name (`1.0.0.dev`) is just a placeholder to have a unified URL for the wheels. The actual versions of wheels are contained in the wheel metadata. Although we don't support Python 3.8 any more (because PyTorch 2.5 dropped support for Python 3.8), the wheels are still built with Python 3.8 ABI to keep the same wheel name as before.
 
+Due to the limitation of `pip`, you have to specify the full URL of the wheel file.
+
+### Install the latest code using `uv`
+
+Another way to install the latest code is to use `uv`:
+
+```console
+$ uv pip install vllm --extra-index-url https://wheels.vllm.ai/nightly
+```
+
+If you want to access the wheels for previous commits (e.g. to bisect a behavior change or a performance regression), you can specify the commit hash in the URL:
+
+```console
+$ export VLLM_COMMIT=72d9c316d3f6ede485146fe5aabd4e61dbc59069 # use full commit hash from the main branch
+$ uv pip install vllm --extra-index-url https://wheels.vllm.ai/${VLLM_COMMIT}
+```
+
+The `uv` approach works for vLLM `v0.6.6` and later and offers an easy-to-remember command. A unique feature of `uv` is that packages in `--extra-index-url` have [higher priority than the default index](https://docs.astral.sh/uv/pip/compatibility/#packages-that-exist-on-multiple-indexes). If the latest public release is `v0.6.6.post1`, `uv`'s behavior allows installing a commit before `v0.6.6.post1` by specifying the `--extra-index-url`. In contrast, `pip` combines packages from `--extra-index-url` and the default index, choosing only the latest version, which makes it difficult to install a development version prior to the released version.
+
+### Install the latest code using `docker`
+
 Another way to access the latest code is to use the docker images:
 
 ```console

From 2de197bdd4b82a004ff99806d054dce1d93b3ced Mon Sep 17 00:00:00 2001
From: Roger Wang <136131678+ywang96@users.noreply.github.com>
Date: Tue, 7 Jan 2025 03:47:36 -0800
Subject: [PATCH 017/115] [V1] Support audio language models on V1 (#11733)

Signed-off-by: Roger Wang <ywang@roblox.com>
---
 docs/source/models/supported_models.md    |  4 ++--
 vllm/model_executor/models/qwen2_audio.py |  9 +++++---
 vllm/model_executor/models/ultravox.py    | 28 +++++++++++++++++------
 3 files changed, 29 insertions(+), 12 deletions(-)

diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md
index 7777545b8b3c1..8c5f6836d6aa8 100644
--- a/docs/source/models/supported_models.md
+++ b/docs/source/models/supported_models.md
@@ -710,7 +710,7 @@ See [this page](#generative-models) for more information on how to use generativ
   - `Qwen/Qwen2-Audio-7B-Instruct`
   -
   - ✅︎
-  -
+  - ✅︎
 * - `Qwen2VLForConditionalGeneration`
   - Qwen2-VL
   - T + I<sup>E+</sup> + V<sup>E+</sup>
@@ -724,7 +724,7 @@ See [this page](#generative-models) for more information on how to use generativ
   - `fixie-ai/ultravox-v0_3`
   -
   - ✅︎
-  -
+  - ✅︎
 ```
 
 <sup>E</sup> Pre-computed embeddings can be inputted for this modality.  
diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py
index 576b01776e5de..7012ddc66cd9c 100644
--- a/vllm/model_executor/models/qwen2_audio.py
+++ b/vllm/model_executor/models/qwen2_audio.py
@@ -335,13 +335,16 @@ def _process_audio_input(self,
         selected_audio_feature = audio_outputs.last_hidden_state
         audio_features = self.multi_modal_projector(selected_audio_feature)
         num_audios, max_audio_tokens, embed_dim = audio_features.shape
+        audio_output_lengths = audio_output_lengths.unsqueeze(1)
         audio_features_mask = torch.arange(max_audio_tokens).expand(
-            num_audios, max_audio_tokens
-        ).to(audio_output_lengths.device) < audio_output_lengths.unsqueeze(1)
+            num_audios, max_audio_tokens).to(
+                audio_output_lengths.device) < audio_output_lengths
         masked_audio_features = audio_features[audio_features_mask].view(
             -1, embed_dim)
 
-        return masked_audio_features
+        # Split to tuple of embeddings for individual audio input.
+        return torch.split(masked_audio_features,
+                           audio_output_lengths.flatten().tolist())
 
     def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]:
         audio_input = self._parse_and_validate_audio_input(**kwargs)
diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py
index ba823acecbb56..ecafd157b1d61 100644
--- a/vllm/model_executor/models/ultravox.py
+++ b/vllm/model_executor/models/ultravox.py
@@ -1,6 +1,5 @@
 # Adapted from https://github.com/fixie-ai/ultravox/blob/ecd58c4041030bae2ad15aa6bcf04ab43199ea02/ultravox/model/ultravox_model.py
 """PyTorch Ultravox model."""
-
 import math
 from functools import cached_property
 from typing import (Any, Iterable, List, Literal, Mapping, Optional, Set,
@@ -14,6 +13,7 @@
 from transformers.models.whisper import WhisperFeatureExtractor
 from transformers.models.whisper.modeling_whisper import WhisperEncoder
 
+from vllm import envs
 from vllm.attention import AttentionMetadata
 from vllm.config import VllmConfig
 from vllm.model_executor.layers.activation import SiluAndMul, get_act_fn
@@ -35,8 +35,11 @@
 from .interfaces import SupportsMultiModal, SupportsPP
 from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn,
                     init_vllm_registered_model, maybe_prefix,
+                    merge_multimodal_embeddings,
                     merge_multimodal_embeddings_from_map)
 
+_AUDIO_PLACEHOLDER_OVERRIDE = "<|reserved_special_token_0|>"
+_AUDIO_PLACEHOLDER_TOKEN = 128002
 _AUDIO_TOKENS_PER_SECOND = 6.25
 
 
@@ -64,7 +67,14 @@ def _get_hf_processor(
         # Ignored in initialization
         sampling_rate: Optional[int] = None,
     ) -> ProcessorMixin:
-        return self.ctx.get_hf_processor()
+        hf_processor = self.ctx.get_hf_processor()
+
+        # NOTE: Ultravox processing definition uses '<|eot_id|>' as the
+        # placeholder that will cause confusion with the actual end of turn
+        # token, thus we override placeholder with a reserved special
+        # token.
+        hf_processor.audio_token_replacement = _AUDIO_PLACEHOLDER_OVERRIDE
+        return hf_processor
 
     def _get_feature_extractor(
         self,
@@ -465,11 +475,15 @@ def get_input_embeddings(
         inputs_embeds = self.language_model.get_input_embeddings(input_ids)
         if multimodal_embeddings is not None:
 
-            # TODO(ywang96): use merge_multimodal_embeddings after
-            # v0 is deprecated
-            merge_multimodal_embeddings_from_map(
-                inputs_embeds, multimodal_embeddings,
-                attn_metadata.multi_modal_placeholder_index_maps["audio"])
+            # TODO(ywang96): remove this block after v0 is deprecated.
+            if not envs.VLLM_USE_V1:
+                merge_multimodal_embeddings_from_map(
+                    inputs_embeds, multimodal_embeddings,
+                    attn_metadata.multi_modal_placeholder_index_maps["audio"])
+            else:
+                inputs_embeds = merge_multimodal_embeddings(
+                    input_ids, inputs_embeds, multimodal_embeddings,
+                    _AUDIO_PLACEHOLDER_TOKEN)
         return inputs_embeds
 
     def forward(self,

From d9fa1c05ad7149a43051a283d0cbeeb65bf6b4a3 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Tue, 7 Jan 2025 21:42:58 +0800
Subject: [PATCH 018/115] [doc] update how pip can install nightly wheels
 (#11806)

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 docs/source/getting_started/installation/gpu-cuda.md | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/docs/source/getting_started/installation/gpu-cuda.md b/docs/source/getting_started/installation/gpu-cuda.md
index 295555b6c41f0..1cd513177bf0d 100644
--- a/docs/source/getting_started/installation/gpu-cuda.md
+++ b/docs/source/getting_started/installation/gpu-cuda.md
@@ -66,19 +66,19 @@ LLM inference is a fast-evolving field, and the latest code may contain bug fixe
 ### Install the latest code using `pip`
 
 ```console
-$ pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
+$ pip install vllm --pre --extra-index-url https://wheels.vllm.ai/nightly
 ```
 
-If you want to access the wheels for previous commits (e.g. to bisect the behavior change, performance regression), you can specify the commit hash in the URL:
+`--pre` is required for `pip` to consider pre-released versions.
+
+If you want to access the wheels for previous commits (e.g. to bisect a behavior change or a performance regression), you have to specify the full URL of the wheel file, with the commit hash embedded in it, because of a limitation of `pip`:
 
 ```console
 $ export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch
 $ pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
 ```
 
-Note that the wheels are built with Python 3.8 ABI (see [PEP 425](https://peps.python.org/pep-0425/) for more details about ABI), so **they are compatible with Python 3.8 and later**. The version string in the wheel file name (`1.0.0.dev`) is just a placeholder to have a unified URL for the wheels. The actual versions of wheels are contained in the wheel metadata. Although we don't support Python 3.8 any more (because PyTorch 2.5 dropped support for Python 3.8), the wheels are still built with Python 3.8 ABI to keep the same wheel name as before.
-
-Due to the limitation of `pip`, you have to specify the full URL of the wheel file.
+Note that the wheels are built with Python 3.8 ABI (see [PEP 425](https://peps.python.org/pep-0425/) for more details about ABI), so **they are compatible with Python 3.8 and later**. The version string in the wheel file name (`1.0.0.dev`) is just a placeholder to have a unified URL for the wheels; the actual versions of the wheels are contained in the wheel metadata (the wheels listed in the extra index URL have the correct versions). Although we don't support Python 3.8 any more (because PyTorch 2.5 dropped support for Python 3.8), the wheels are still built with Python 3.8 ABI to keep the same wheel name as before.
 
 ### Install the latest code using `uv`
 
@@ -126,7 +126,7 @@ $ cd vllm
 $ VLLM_USE_PRECOMPILED=1 pip install --editable .
 ```
 
-This will download the latest nightly wheel and use the compiled libraries from there in the install.
+This will download the latest nightly wheel from https://vllm-wheels.s3.us-west-2.amazonaws.com/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl and use the compiled libraries from there in the installation.
 
 The `VLLM_PRECOMPILED_WHEEL_LOCATION` environment variable can be used instead of `VLLM_USE_PRECOMPILED` to specify a custom path or URL to the wheel file. For example, to use the [0.6.1.post1 PyPi wheel](https://pypi.org/project/vllm/#files):
 

From c0efe92d8b9ef968a5b796fd7d6ebc426d78e726 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Tue, 7 Jan 2025 21:50:58 +0800
Subject: [PATCH 019/115] [Doc] Add note to `gte-Qwen2` models (#11808)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 docs/source/models/supported_models.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md
index 8c5f6836d6aa8..3ba34c77205e5 100644
--- a/docs/source/models/supported_models.md
+++ b/docs/source/models/supported_models.md
@@ -430,6 +430,9 @@ You can set `--hf-overrides '{"is_causal": false}'` to change the attention mask
 
 On the other hand, its 1.5B variant (`Alibaba-NLP/gte-Qwen2-1.5B-instruct`) uses causal attention
 despite being described otherwise on its model card.
+
+Regardless of the variant, you need to enable `--trust-remote-code` for the correct tokenizer to be
+loaded. See [relevant issue on HF Transformers](https://github.com/huggingface/transformers/issues/34882).
 ```
 
 If your model is not in the above list, we will try to automatically convert the model using

From 869579a702cb086cca6bd6ec4500f954a9adec1c Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Wed, 8 Jan 2025 01:04:28 +0800
Subject: [PATCH 020/115] [optimization] remove python function call for custom
 op (#11750)

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 vllm/_custom_ops.py                             |  4 ----
 vllm/model_executor/layers/activation.py        | 17 +++++++++++------
 .../layers/fused_moe/fused_marlin_moe.py        |  4 ++--
 .../layers/fused_moe/fused_moe.py               |  3 ++-
 4 files changed, 15 insertions(+), 13 deletions(-)

diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py
index eb2f69df42624..afb350591e562 100644
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -35,10 +35,6 @@ def register_fake(fn):
 
 
 # activation ops
-def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
-    torch.ops._C.silu_and_mul(out, x)
-
-
 def gelu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
     torch.ops._C.gelu_and_mul(out, x)
 
diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py
index 34d65ed51ef3f..46d4670bfe4f9 100644
--- a/vllm/model_executor/layers/activation.py
+++ b/vllm/model_executor/layers/activation.py
@@ -10,6 +10,7 @@
                               get_tensor_model_parallel_world_size)
 from vllm.model_executor.custom_op import CustomOp
 from vllm.model_executor.utils import set_weight_attrs
+from vllm.platforms import current_platform
 from vllm.utils import LazyDict
 
 
@@ -58,27 +59,31 @@ class SiluAndMul(CustomOp):
         return: (num_tokens, d) or (batch_size, seq_len, d)
     """
 
+    def __init__(self):
+        super().__init__()
+        if current_platform.is_cuda_alike():
+            self.op = torch.ops._C.silu_and_mul
+        elif current_platform.is_xpu():
+            import intel_extension_for_pytorch as ipex
+            self.op = ipex.llm.functional.silu_and_mul
+
     def forward_native(self, x: torch.Tensor) -> torch.Tensor:
         """PyTorch-native implementation equivalent to forward()."""
         d = x.shape[-1] // 2
         return F.silu(x[..., :d]) * x[..., d:]
 
     def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
-        from vllm import _custom_ops as ops
-
         d = x.shape[-1] // 2
         output_shape = (x.shape[:-1] + (d, ))
         out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
-        ops.silu_and_mul(out, x)
+        self.op(out, x)
         return out
 
     def forward_xpu(self, x: torch.Tensor) -> torch.Tensor:
-        from vllm._ipex_ops import ipex_ops as ops
-
         d = x.shape[-1] // 2
         output_shape = (x.shape[:-1] + (d, ))
         out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
-        ops.silu_and_mul(out, x)
+        self.op(out, x)
         return out
 
 
diff --git a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py
index 4741d69de11ac..87993267c05b5 100644
--- a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py
@@ -4,7 +4,6 @@
 
 import torch
 
-from vllm import _custom_ops as ops
 from vllm.model_executor.layers.fused_moe.fused_moe import (
     fused_topk, moe_align_block_size, try_get_optimal_moe_config)
 from vllm.scalar_type import scalar_types
@@ -301,7 +300,8 @@ def fused_marlin_moe(
         False,
     )
 
-    ops.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, 2 * N))
+    torch.ops._C.silu_and_mul(intermediate_cache2,
+                              intermediate_cache1.view(-1, 2 * N))
 
     intermediate_cache3 = torch.ops._moe_C.marlin_gemm_moe(
         intermediate_cache2,
diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py
index 4101facbe7874..1bb6bc753d37c 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -753,7 +753,8 @@ def fused_experts_impl(hidden_states: torch.Tensor,
                                 use_int8_w8a16=use_int8_w8a16,
                                 block_shape=block_shape)
 
-        ops.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, N))
+        torch.ops._C.silu_and_mul(intermediate_cache2,
+                                  intermediate_cache1.view(-1, N))
 
         invoke_fused_moe_kernel(intermediate_cache2,
                                 w2,

From c994223d569221652643e897d8402b835ead411d Mon Sep 17 00:00:00 2001
From: jiangjiadi <34134495+jiangjiadi@users.noreply.github.com>
Date: Wed, 8 Jan 2025 02:36:34 +0800
Subject: [PATCH 021/115] [Bugfix] update the prefix for qwen2 (#11795)

Co-authored-by: jiadi.jjd <jiadi.jjd@antgroup.com>
---
 vllm/model_executor/models/qwen2.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py
index 01745b5fd53e1..d20fb150f7e39 100644
--- a/vllm/model_executor/models/qwen2.py
+++ b/vllm/model_executor/models/qwen2.py
@@ -298,7 +298,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
             lambda prefix: Qwen2DecoderLayer(config=config,
                                              cache_config=cache_config,
                                              quant_config=quant_config,
-                                             prefix=f"{prefix}.layers"),
+                                             prefix=prefix),
             prefix=f"{prefix}.layers",
         )
 

From 973f5dc581c35a9c5b9176116e2f42f3f97d0d01 Mon Sep 17 00:00:00 2001
From: sroy745 <142070531+sroy745@users.noreply.github.com>
Date: Tue, 7 Jan 2025 11:19:12 -0800
Subject: [PATCH 022/115] [Doc]Add documentation for using EAGLE in vLLM
 (#11417)

Signed-off-by: Sourashis Roy <sroy@roblox.com>
---
 docs/source/features/spec_decode.md | 66 +++++++++++++++++++++++++++++
 1 file changed, 66 insertions(+)

diff --git a/docs/source/features/spec_decode.md b/docs/source/features/spec_decode.md
index bc8a0aa14dc5a..903acadb71426 100644
--- a/docs/source/features/spec_decode.md
+++ b/docs/source/features/spec_decode.md
@@ -159,6 +159,72 @@ A variety of speculative models of this type are available on HF hub:
 - [granite-7b-instruct-accelerator](https://huggingface.co/ibm-granite/granite-7b-instruct-accelerator)
 - [granite-20b-code-instruct-accelerator](https://huggingface.co/ibm-granite/granite-20b-code-instruct-accelerator)
 
+## Speculating using EAGLE based draft models
+
+The following code configures vLLM to use speculative decoding where proposals are generated by
+an [EAGLE (Extrapolation Algorithm for Greater Language-model Efficiency)](https://arxiv.org/pdf/2401.15077) based draft model.
+
+```python
+from vllm import LLM, SamplingParams
+
+prompts = [
+    "The future of AI is",
+]
+sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+
+llm = LLM(
+    model="meta-llama/Meta-Llama-3-8B-Instruct",
+    tensor_parallel_size=4,
+    speculative_model="path/to/modified/eagle/model",
+    speculative_draft_tensor_parallel_size=1,
+)
+
+outputs = llm.generate(prompts, sampling_params)
+
+for output in outputs:
+    prompt = output.prompt
+    generated_text = output.outputs[0].text
+    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+
+```
+
+A few important things to consider when using the EAGLE based draft models:
+
+1. The EAGLE draft models available in the [HF repository for EAGLE models](https://huggingface.co/yuhuili) cannot be
+   used directly with vLLM due to differences in the expected layer names and model definition.
+   To use these models with vLLM, use the [following script](https://gist.github.com/abhigoyal1997/1e7a4109ccb7704fbc67f625e86b2d6d) 
+   to convert them. Note that this script does not modify the model's weights.
+
+   In the above example, use the script to first convert
+   the [yuhuili/EAGLE-LLaMA3-Instruct-8B](https://huggingface.co/yuhuili/EAGLE-LLaMA3-Instruct-8B) model 
+   and then use the converted checkpoint as the draft model in vLLM.
+
+2. The EAGLE based draft models need to be run without tensor parallelism
+   (i.e. `speculative_draft_tensor_parallel_size` is set to 1), although
+   it is possible to run the main model using tensor parallelism (see example above).
+
+3. When using EAGLE-based speculators with vLLM, the observed speedup is lower than what is
+   reported in the reference implementation [here](https://github.com/SafeAILab/EAGLE). This issue is under
+   investigation and tracked here: [https://github.com/vllm-project/vllm/issues/9565](https://github.com/vllm-project/vllm/issues/9565).
+
+
+A variety of EAGLE draft models are available on the Hugging Face hub:
+
+| Base Model                                                           | EAGLE on Hugging Face                     | # EAGLE Parameters |
+|---------------------------------------------------------------------|-------------------------------------------|--------------------|
+| Vicuna-7B-v1.3                                                       | yuhuili/EAGLE-Vicuna-7B-v1.3             | 0.24B              |
+| Vicuna-13B-v1.3                                                      | yuhuili/EAGLE-Vicuna-13B-v1.3            | 0.37B              |
+| Vicuna-33B-v1.3                                                      | yuhuili/EAGLE-Vicuna-33B-v1.3            | 0.56B              |
+| LLaMA2-Chat 7B                                                       | yuhuili/EAGLE-llama2-chat-7B             | 0.24B              |
+| LLaMA2-Chat 13B                                                      | yuhuili/EAGLE-llama2-chat-13B            | 0.37B              |
+| LLaMA2-Chat 70B                                                      | yuhuili/EAGLE-llama2-chat-70B            | 0.99B              |
+| Mixtral-8x7B-Instruct-v0.1                                           | yuhuili/EAGLE-mixtral-instruct-8x7B      | 0.28B              |
+| LLaMA3-Instruct 8B                                                   | yuhuili/EAGLE-LLaMA3-Instruct-8B         | 0.25B              |
+| LLaMA3-Instruct 70B                                                  | yuhuili/EAGLE-LLaMA3-Instruct-70B        | 0.99B              |
+| Qwen2-7B-Instruct                                                    | yuhuili/EAGLE-Qwen2-7B-Instruct          | 0.26B              |
+| Qwen2-72B-Instruct                                                   | yuhuili/EAGLE-Qwen2-72B-Instruct         | 1.05B              |
+
+
 ## Lossless guarantees of Speculative Decoding
 
 In vLLM, speculative decoding aims to enhance inference efficiency while maintaining accuracy. This section addresses the lossless guarantees of

From a4e2b268568b335d8fe37f8eaaa894cec3ba9397 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jie=20Fu=20=28=E5=82=85=E6=9D=B0=29?= <jiefu@tencent.com>
Date: Wed, 8 Jan 2025 08:15:50 +0800
Subject: [PATCH 023/115] [Bugfix] Significant performance drop on CPUs with
 --num-scheduler-steps > 1 (#11794)

---
 vllm/engine/arg_utils.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index e94664308cf8d..0850bab6bb7e1 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -1157,6 +1157,12 @@ def create_engine_config(self,
             if self.enable_chunked_prefill and self.pipeline_parallel_size > 1:
                 raise ValueError("Multi-Step Chunked-Prefill is not supported "
                                  "for pipeline-parallel-size > 1")
+            from vllm.platforms import current_platform
+            if current_platform.is_cpu():
+                logger.warning("Multi-Step (--num-scheduler-steps > 1) is "
+                               "currently not supported for CPUs and has been "
+                               "disabled.")
+                self.num_scheduler_steps = 1
 
         # make sure num_lookahead_slots is set the higher value depending on
         # if we are using speculative decoding or multi-step

From 5950f555a1d2ce19c30efb24abe03737320d05c1 Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Wed, 8 Jan 2025 01:20:12 +0000
Subject: [PATCH 024/115] [Doc] Group examples into categories (#11782)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 .gitignore                                    |   5 +-
 docs/Makefile                                 |   4 +
 docs/requirements-docs.txt                    |   1 +
 docs/source/conf.py                           |   4 +
 docs/source/generate_examples.py              | 264 +++++++++++++++---
 .../examples/examples_index.template.md       |   8 -
 examples/fp8/README.md                        |   6 +-
 .../Otel.md                                   |   0
 .../dummy_client.py                           |   0
 .../README.md                                 |  10 +-
 .../docker-compose.yaml                       |   0
 .../grafana.json                              |   0
 .../prometheus.yaml                           |   0
 13 files changed, 240 insertions(+), 62 deletions(-)
 delete mode 100644 docs/source/getting_started/examples/examples_index.template.md
 rename examples/{production_monitoring => opentelemetry}/Otel.md (100%)
 rename examples/{production_monitoring => opentelemetry}/dummy_client.py (100%)
 rename examples/{production_monitoring => prometheus_grafana}/README.md (95%)
 rename examples/{production_monitoring => prometheus_grafana}/docker-compose.yaml (100%)
 rename examples/{production_monitoring => prometheus_grafana}/grafana.json (100%)
 rename examples/{production_monitoring => prometheus_grafana}/prometheus.yaml (100%)

diff --git a/.gitignore b/.gitignore
index bb7e4d5b244a8..89dab8f13bab1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -79,10 +79,7 @@ instance/
 
 # Sphinx documentation
 docs/_build/
-docs/source/getting_started/examples/*.rst
-!**/*.template.rst
-docs/source/getting_started/examples/*.md
-!**/*.template.md
+docs/source/getting_started/examples/
 
 # PyBuilder
 .pybuilder/
diff --git a/docs/Makefile b/docs/Makefile
index d0c3cbf1020d5..5b801f79d1f26 100644
--- a/docs/Makefile
+++ b/docs/Makefile
@@ -18,3 +18,7 @@ help:
 # "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
 %: Makefile
 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+clean:
+	@$(SPHINXBUILD) -M clean "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+	rm -rf "$(SOURCEDIR)/getting_started/examples"
diff --git a/docs/requirements-docs.txt b/docs/requirements-docs.txt
index 25a700033cc9e..64cf6ef8fc19d 100644
--- a/docs/requirements-docs.txt
+++ b/docs/requirements-docs.txt
@@ -3,6 +3,7 @@ sphinx-book-theme==1.0.1
 sphinx-copybutton==0.5.2
 myst-parser==3.0.1
 sphinx-argparse==0.4.0
+sphinx-togglebutton==0.3.2
 msgspec
 cloudpickle
 
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 71394c5302a39..1ce11fe057071 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -43,6 +43,10 @@
     "sphinx.ext.autosummary",
     "myst_parser",
     "sphinxarg.ext",
+    "sphinx_togglebutton",
+]
+myst_enable_extensions = [
+    "colon_fence",
 ]
 
 # Add any paths that contain templates here, relative to this directory.
diff --git a/docs/source/generate_examples.py b/docs/source/generate_examples.py
index aef32f7559f74..32bb86c469c78 100644
--- a/docs/source/generate_examples.py
+++ b/docs/source/generate_examples.py
@@ -1,54 +1,234 @@
+import itertools
 import re
+from dataclasses import dataclass, field
 from pathlib import Path
 
+ROOT_DIR = Path(__file__).parent.parent.parent.resolve()
+ROOT_DIR_RELATIVE = '../../../..'
+EXAMPLE_DIR = ROOT_DIR / "examples"
+EXAMPLE_DOC_DIR = ROOT_DIR / "docs/source/getting_started/examples"
+
 
 def fix_case(text: str) -> str:
-    subs = [
-        ("api", "API"),
-        ("llm", "LLM"),
-        ("vllm", "vLLM"),
-        ("openai", "OpenAI"),
-        ("multilora", "MultiLoRA"),
-    ]
-    for sub in subs:
-        text = re.sub(*sub, text, flags=re.IGNORECASE)
+    subs = {
+        "api": "API",
+        "cpu": "CPU",
+        "llm": "LLM",
+        "tpu": "TPU",
+        "aqlm": "AQLM",
+        "gguf": "GGUF",
+        "lora": "LoRA",
+        "vllm": "vLLM",
+        "openai": "OpenAI",
+        "multilora": "MultiLoRA",
+        "mlpspeculator": "MLPSpeculator",
+        r"fp\d+": lambda x: x.group(0).upper(),  # e.g. fp16, fp32
+        r"int\d+": lambda x: x.group(0).upper(),  # e.g. int8, int16
+    }
+    for pattern, repl in subs.items():
+        text = re.sub(rf'\b{pattern}\b', repl, text, flags=re.IGNORECASE)
     return text
 
 
-def generate_title(filename: str) -> str:
-    # Turn filename into a title
-    title = filename.replace("_", " ").title()
-    # Handle acronyms and names
-    title = fix_case(title)
-    return f"# {title}"
+@dataclass
+class Index:
+    """
+    Index class to generate a structured document index.
+
+    Attributes:
+        path (Path): The path to save the index file to.
+        title (str): The title of the index.
+        description (str): A brief description of the index.
+        caption (str): An optional caption for the table of contents.
+        maxdepth (int): The maximum depth of the table of contents. Defaults to 1.
+        documents (list[str]): A list of document paths to include in the index. Defaults to an empty list.
+
+    Methods:
+        generate() -> str:
+            Generates the index content as a string in the specified format.
+    """ # noqa: E501
+    path: Path
+    title: str
+    description: str
+    caption: str
+    maxdepth: int = 1
+    documents: list[str] = field(default_factory=list)
+
+    def generate(self) -> str:
+        content = f"# {self.title}\n\n{self.description}\n\n"
+        content += "```{toctree}\n"
+        content += f":caption: {self.caption}\n:maxdepth: {self.maxdepth}\n"
+        content += "\n".join(sorted(self.documents)) + "\n```\n"
+        return content
+
+
+@dataclass
+class Example:
+    """
+    Example class for generating documentation content from a given path.
+
+    Attributes:
+        path (Path): The path to the main directory or file.
+        category (str): The category of the document.
+        main_file (Path): The main file in the directory.
+        other_files (list[Path]): List of other files in the directory.
+        title (str): The title of the document.
+
+    Methods:
+        __post_init__(): Initializes the main_file, other_files, and title attributes.
+        determine_main_file() -> Path: Determines the main file in the given path.
+        determine_other_files() -> list[Path]: Determines other files in the directory excluding the main file.
+        determine_title() -> str: Determines the title of the document.
+        generate() -> str: Generates the documentation content.
+    """ # noqa: E501
+    path: Path
+    category: str = None
+    main_file: Path = field(init=False)
+    other_files: list[Path] = field(init=False)
+    title: str = field(init=False)
+
+    def __post_init__(self):
+        self.main_file = self.determine_main_file()
+        self.other_files = self.determine_other_files()
+        self.title = self.determine_title()
+
+    def determine_main_file(self) -> Path:
+        """
+        Determines the main file in the given path.
+        If the path is a file, it returns the path itself. Otherwise, it searches
+        for Markdown files (*.md) in the directory and returns the last one listed.
+        Returns:
+            Path: The main file path, either the original path if it's a file or the last
+            Markdown file listed in the directory.
+        Raises:
+            IndexError: If no Markdown files are found in the directory.
+        """ # noqa: E501
+        return self.path if self.path.is_file() else list(
+            self.path.glob("*.md")).pop()
+
+    def determine_other_files(self) -> list[Path]:
+        """
+        Determine other files in the directory excluding the main file.
+
+        This method checks if the given path is a file. If it is, it returns an empty list.
+        Otherwise, it recursively searches through the directory and returns a list of all
+        files that are not the main file.
+
+        Returns:
+            list[Path]: A list of Path objects representing the other files in the directory.
+        """ # noqa: E501
+        if self.path.is_file():
+            return []
+        is_other_file = lambda file: file.is_file() and file != self.main_file
+        return [file for file in self.path.rglob("*") if is_other_file(file)]
+
+    def determine_title(self) -> str:
+        return fix_case(self.path.stem.replace("_", " ").title())
+
+    def generate(self) -> str:
+        # Convert the path to a relative path from __file__
+        make_relative = lambda path: ROOT_DIR_RELATIVE / path.relative_to(
+            ROOT_DIR)
+
+        content = f"Source <gh-file:{self.path.relative_to(ROOT_DIR)}>.\n\n"
+        if self.main_file.suffix == ".py":
+            content += f"# {self.title}\n\n"
+        include = "include" if self.main_file.suffix == ".md" else \
+            "literalinclude"
+        content += f":::{{{include}}} {make_relative(self.main_file)}\n:::\n\n"
+
+        if not self.other_files:
+            return content
+
+        content += "## Example materials\n\n"
+        for file in self.other_files:
+            include = "include" if file.suffix == ".md" else "literalinclude"
+            content += f":::{{admonition}} {file.relative_to(self.path)}\n"
+            content += ":class: dropdown\n\n"
+            content += f":::{{{include}}} {make_relative(file)}\n:::\n"
+            content += ":::\n\n"
+
+        return content
 
 
 def generate_examples():
-    root_dir = Path(__file__).parent.parent.parent.resolve()
-
-    # Source paths
-    script_dir = root_dir / "examples"
-    script_paths = sorted(script_dir.glob("*.py"))
-
-    # Destination paths
-    doc_dir = root_dir / "docs/source/getting_started/examples"
-    doc_paths = [doc_dir / f"{path.stem}.md" for path in script_paths]
-
-    # Generate the example docs for each example script
-    for script_path, doc_path in zip(script_paths, doc_paths):
-        # Make script_path relative to doc_path and call it include_path
-        include_path = '../../../..' / script_path.relative_to(root_dir)
-        content = (f"{generate_title(doc_path.stem)}\n\n"
-                   f"Source: <gh-file:examples/{script_path.name}>.\n\n"
-                   f"```{{literalinclude}} {include_path}\n"
-                   ":language: python\n"
-                   ":linenos:\n```")
+    # Create the EXAMPLE_DOC_DIR if it doesn't exist
+    if not EXAMPLE_DOC_DIR.exists():
+        EXAMPLE_DOC_DIR.mkdir(parents=True)
+
+    # Create empty indices
+    examples_index = Index(
+        path=EXAMPLE_DOC_DIR / "examples_index.md",
+        title="Examples",
+        description=
+        "A collection of examples demonstrating usage of vLLM.\nAll documented examples are autogenerated using <gh-file:docs/source/generate_examples.py> from examples found in <gh-file:examples>.",  # noqa: E501
+        caption="Examples",
+        maxdepth=1)  # TODO change to 2 when examples start being categorised
+    category_indices = {
+        "offline_inference":
+        Index(
+            path=EXAMPLE_DOC_DIR / "examples_offline_inference_index.md",
+            title="Offline Inference",
+            description=
+            "Offline inference examples demonstrate how to use vLLM in an offline setting, where the model is queried for predictions in batches.",  # noqa: E501
+            caption="Examples",
+        ),
+        "online_serving":
+        Index(
+            path=EXAMPLE_DOC_DIR / "examples_online_serving_index.md",
+            title="Online Serving",
+            description=
+            "Online serving examples demonstrate how to use vLLM in an online setting, where the model is queried for predictions in real-time.",  # noqa: E501
+            caption="Examples",
+        ),
+        "other":
+        Index(
+            path=EXAMPLE_DOC_DIR / "examples_other_index.md",
+            title="Other",
+            description=
+            "Other examples that don't strongly fit into the online or offline serving categories.",  # noqa: E501
+            caption="Examples",
+        ),
+    }
+
+    examples = []
+    # Find categorised examples
+    for category in category_indices:
+        category_dir = EXAMPLE_DIR / category
+        py = category_dir.glob("*.py")
+        md = category_dir.glob("*.md")
+        for path in itertools.chain(py, md):
+            examples.append(Example(path, category))
+        # Find examples in subdirectories
+        for path in category_dir.glob("*/*.md"):
+            examples.append(Example(path.parent, category))
+    # Find uncategorised examples
+    py = EXAMPLE_DIR.glob("*.py")
+    md = EXAMPLE_DIR.glob("*.md")
+    for path in itertools.chain(py, md):
+        examples.append(Example(path))
+    # Find examples in subdirectories
+    for path in EXAMPLE_DIR.glob("*/*.md"):
+        # Skip categorised examples
+        if path.parent.name in category_indices:
+            continue
+        examples.append(Example(path.parent))
+
+    # Generate the example documentation
+    for example in examples:
+        doc_path = EXAMPLE_DOC_DIR / f"{example.path.stem}.md"
         with open(doc_path, "w+") as f:
-            f.write(content)
-
-    # Generate the toctree for the example scripts
-    with open(doc_dir / "examples_index.template.md") as f:
-        examples_index = f.read()
-    with open(doc_dir / "examples_index.md", "w+") as f:
-        example_docs = "\n".join(path.stem + ".md" for path in script_paths)
-        f.write(examples_index.replace(r"%EXAMPLE_DOCS%", example_docs))
+            f.write(example.generate())
+        # Add the example to the appropriate index
+        index = category_indices.get(example.category, examples_index)
+        index.documents.append(example.path.stem)
+
+    # Generate the index files
+    for category_index in category_indices.values():
+        if category_index.documents:
+            examples_index.documents.insert(0, category_index.path.name)
+            with open(category_index.path, "w+") as f:
+                f.write(category_index.generate())
+
+    with open(examples_index.path, "w+") as f:
+        f.write(examples_index.generate())
diff --git a/docs/source/getting_started/examples/examples_index.template.md b/docs/source/getting_started/examples/examples_index.template.md
deleted file mode 100644
index de7a91c0ffa48..0000000000000
--- a/docs/source/getting_started/examples/examples_index.template.md
+++ /dev/null
@@ -1,8 +0,0 @@
-# Examples
-
-```{toctree}
-:maxdepth: 1
-:caption: Scripts
-
-%EXAMPLE_DOCS%
-```
\ No newline at end of file
diff --git a/examples/fp8/README.md b/examples/fp8/README.md
index 181c36558fcff..5492872cae93a 100644
--- a/examples/fp8/README.md
+++ b/examples/fp8/README.md
@@ -56,7 +56,7 @@ python3 examples/fp8/extract_scales.py --quantized_model <QUANTIZED_MODEL_DIR> -
 ```
 ### 4. Load KV Cache Scaling Factors into VLLM.
 This script evaluates the inference throughput of language models using various backends such as vLLM. It measures the time taken to process a given number of prompts and generate sequences for each prompt. The recently generated KV cache scaling factors are now integrated into the benchmarking process and allow for KV cache scaling factors to be utilized for FP8.
-```python
+```
 # prerequisites:
 # -  LLaMa 2 kv_cache_scales.json file
 
@@ -90,7 +90,7 @@ optional arguments:
   --kv-cache-dtype {auto,fp8} Data type for kv cache storage. If "auto", will use model data type. FP8_E5M2 (without scaling) is only supported on cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is instead supported ```for common inference criteria.
   --quantization-param-path QUANT_PARAM_JSON Path to the JSON file containing the KV cache scaling factors. This should generally be supplied, when KV cache dtype is FP8. Otherwise, KV cache scaling factors default to 1.0, which may cause accuracy issues. FP8_E5M2 (without scaling) is only supported on cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is instead supported for common inference criteria.
 ```
-```
 Example:
+```console
 python3 benchmarks/benchmark_throughput.py --input-len <INPUT_LEN> --output-len <OUTPUT_LEN> -tp <TENSOR_PARALLEL_SIZE> --kv-cache-dtype fp8 --quantization-param-path <path/to/kv_cache_scales.json> --model <path-to-llama2>
-```python
+```
diff --git a/examples/production_monitoring/Otel.md b/examples/opentelemetry/Otel.md
similarity index 100%
rename from examples/production_monitoring/Otel.md
rename to examples/opentelemetry/Otel.md
diff --git a/examples/production_monitoring/dummy_client.py b/examples/opentelemetry/dummy_client.py
similarity index 100%
rename from examples/production_monitoring/dummy_client.py
rename to examples/opentelemetry/dummy_client.py
diff --git a/examples/production_monitoring/README.md b/examples/prometheus_grafana/README.md
similarity index 95%
rename from examples/production_monitoring/README.md
rename to examples/prometheus_grafana/README.md
index 807c0470e7b30..c49e5306a1cb4 100644
--- a/examples/production_monitoring/README.md
+++ b/examples/prometheus_grafana/README.md
@@ -1,4 +1,4 @@
-# vLLM + Prometheus/Grafana 
+# Prometheus and Grafana 
 
 This is a simple example that shows you how to connect vLLM metric logging to the Prometheus/Grafana stack. For this example, we launch Prometheus and Grafana via Docker. You can checkout other methods through [Prometheus](https://prometheus.io/) and [Grafana](https://grafana.com/) websites. 
 
@@ -6,7 +6,7 @@ Install:
 - [`docker`](https://docs.docker.com/engine/install/)
 - [`docker compose`](https://docs.docker.com/compose/install/linux/#install-using-the-repository)
 
-### Launch
+## Launch
 
 Prometheus metric logging is enabled by default in the OpenAI-compatible server. Launch via the entrypoint:
 ```bash
@@ -35,11 +35,11 @@ python3 ../../benchmarks/benchmark_serving.py \
 
 Navigating to [`http://localhost:8000/metrics`](http://localhost:8000/metrics) will show the raw Prometheus metrics being exposed by vLLM.
 
-### Grafana Dashboard
+## Grafana Dashboard
 
 Navigate to [`http://localhost:3000`](http://localhost:3000). Log in with the default username (`admin`) and password (`admin`).
 
-#### Add Prometheus Data Source
+### Add Prometheus Data Source
 
 Navigate to [`http://localhost:3000/connections/datasources/new`](http://localhost:3000/connections/datasources/new) and select Prometheus. 
 
@@ -47,7 +47,7 @@ On Prometheus configuration page, we need to add the `Prometheus Server URL` in
 
 Click `Save & Test`. You should get a green check saying "Successfully queried the Prometheus API.".
 
-#### Import Dashboard 
+### Import Dashboard 
 
 Navigate to [`http://localhost:3000/dashboard/import`](http://localhost:3000/dashboard/import), upload `grafana.json`, and select the `prometheus` datasource. You should see a screen that looks like the following:
 
diff --git a/examples/production_monitoring/docker-compose.yaml b/examples/prometheus_grafana/docker-compose.yaml
similarity index 100%
rename from examples/production_monitoring/docker-compose.yaml
rename to examples/prometheus_grafana/docker-compose.yaml
diff --git a/examples/production_monitoring/grafana.json b/examples/prometheus_grafana/grafana.json
similarity index 100%
rename from examples/production_monitoring/grafana.json
rename to examples/prometheus_grafana/grafana.json
diff --git a/examples/production_monitoring/prometheus.yaml b/examples/prometheus_grafana/prometheus.yaml
similarity index 100%
rename from examples/production_monitoring/prometheus.yaml
rename to examples/prometheus_grafana/prometheus.yaml

From 91445c7bc8000a6f6f1efed0882076d7001be968 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Wed, 8 Jan 2025 10:17:16 +0800
Subject: [PATCH 025/115] [Bugfix] Fix image input for Pixtral-HF (#11741)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 ...e_inference_vision_language_multi_image.py | 41 ++++++++++++++++---
 vllm/model_executor/models/llava.py           |  6 +++
 vllm/model_executor/models/pixtral.py         |  2 +-
 vllm/model_executor/models/utils.py           |  9 ++++
 4 files changed, 52 insertions(+), 6 deletions(-)

diff --git a/examples/offline_inference_vision_language_multi_image.py b/examples/offline_inference_vision_language_multi_image.py
index 6af8d7768e75d..cf2e90a325c6a 100644
--- a/examples/offline_inference_vision_language_multi_image.py
+++ b/examples/offline_inference_vision_language_multi_image.py
@@ -23,7 +23,7 @@
 class ModelRequestData(NamedTuple):
     llm: LLM
     prompt: str
-    stop_token_ids: Optional[List[str]]
+    stop_token_ids: Optional[List[int]]
     image_data: List[Image]
     chat_template: Optional[str]
 
@@ -44,12 +44,14 @@ def load_aria(question, image_urls: List[str]) -> ModelRequestData:
     prompt = (f"<|im_start|>user\n{placeholders}{question}<|im_end|>\n"
               "<|im_start|>assistant\n")
     stop_token_ids = [93532, 93653, 944, 93421, 1019, 93653, 93519]
+
     return ModelRequestData(
         llm=llm,
         prompt=prompt,
         stop_token_ids=stop_token_ids,
         image_data=[fetch_image(url) for url in image_urls],
-        chat_template=None)
+        chat_template=None,
+    )
 
 
 def load_h2onvl(question: str, image_urls: List[str]) -> ModelRequestData:
@@ -166,7 +168,8 @@ def load_mllama(question, image_urls: List[str]) -> ModelRequestData:
         limit_mm_per_prompt={"image": len(image_urls)},
     )
 
-    prompt = f"<|image|><|image|><|begin_of_text|>{question}"
+    placeholders = "<|image|>" * len(image_urls)
+    prompt = f"{placeholders}<|begin_of_text|>{question}"
     return ModelRequestData(
         llm=llm,
         prompt=prompt,
@@ -209,6 +212,31 @@ def load_nvlm_d(question: str, image_urls: List[str]):
     )
 
 
+def load_pixtral_hf(question: str, image_urls: List[str]) -> ModelRequestData:
+    model_name = "mistral-community/pixtral-12b"
+
+    # Adjust this as necessary to fit in GPU
+    llm = LLM(
+        model=model_name,
+        max_model_len=8192,
+        max_num_seqs=2,
+        tensor_parallel_size=2,
+        limit_mm_per_prompt={"image": len(image_urls)},
+    )
+
+    placeholders = "[IMG]" * len(image_urls)
+    prompt = f"<s>[INST]{question}\n{placeholders}[/INST]"
+    stop_token_ids = None
+
+    return ModelRequestData(
+        llm=llm,
+        prompt=prompt,
+        stop_token_ids=stop_token_ids,
+        image_data=[fetch_image(url) for url in image_urls],
+        chat_template=None,
+    )
+
+
 def load_phi3v(question: str, image_urls: List[str]) -> ModelRequestData:
     # num_crops is an override kwarg to the multimodal image processor;
     # For some models, e.g., Phi-3.5-vision-instruct, it is recommended
@@ -244,7 +272,8 @@ def load_phi3v(question: str, image_urls: List[str]) -> ModelRequestData:
     )
 
 
-def load_qwenvl_chat(question: str, image_urls: List[str]) -> ModelRequestData:
+def load_qwen_vl_chat(question: str,
+                      image_urls: List[str]) -> ModelRequestData:
     model_name = "Qwen/Qwen-VL-Chat"
     llm = LLM(
         model=model_name,
@@ -274,6 +303,7 @@ def load_qwenvl_chat(question: str, image_urls: List[str]) -> ModelRequestData:
 
     stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>"]
     stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
+
     return ModelRequestData(
         llm=llm,
         prompt=prompt,
@@ -348,7 +378,8 @@ def load_qwen2_vl(question, image_urls: List[str]) -> ModelRequestData:
     "mllama": load_mllama,
     "NVLM_D": load_nvlm_d,
     "phi3_v": load_phi3v,
-    "qwen_vl_chat": load_qwenvl_chat,
+    "pixtral_hf": load_pixtral_hf,
+    "qwen_vl_chat": load_qwen_vl_chat,
     "qwen2_vl": load_qwen2_vl,
 }
 
diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py
index 4299af8cd03a2..305f1364dba23 100644
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -546,6 +546,12 @@ def _parse_and_validate_image_input(
                 raise ValueError("Incorrect type of pixel values. "
                                  f"Got type: {type(pixel_values)}")
 
+            if self.config.vision_config.model_type == "pixtral":
+                return LlavaImagePixelInputs(
+                    type="pixel_values",
+                    data=flatten_bn(pixel_values),
+                )
+
             return LlavaImagePixelInputs(
                 type="pixel_values",
                 data=self._validate_pixel_values(
diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py
index 9e1d38512c0b4..b74bb3c8a3f88 100644
--- a/vllm/model_executor/models/pixtral.py
+++ b/vllm/model_executor/models/pixtral.py
@@ -774,7 +774,7 @@ def get_num_image_tokens(
     ) -> int:
         return get_pixtral_hf_image_feature_size(
             image_size=self.vision_config.image_size,
-            patch_size=self.get_image_size(),
+            patch_size=self.vision_config.patch_size,
         )
 
     def get_max_image_tokens(self) -> int:
diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py
index 31017f16d3c97..4ed3b237ae0e2 100644
--- a/vllm/model_executor/models/utils.py
+++ b/vllm/model_executor/models/utils.py
@@ -281,6 +281,15 @@ def flatten_bn(
     ...
 
 
+@overload
+def flatten_bn(
+    x: Union[List[torch.Tensor], torch.Tensor],
+    *,
+    concat: bool = False,
+) -> Union[List[torch.Tensor], torch.Tensor]:
+    ...
+
+
 def flatten_bn(
     x: Union[List[torch.Tensor], torch.Tensor],
     *,

From 4d29e91be84d27ca313d657eee92c067439a4c23 Mon Sep 17 00:00:00 2001
From: Divakar Verma <137818590+divakar-amd@users.noreply.github.com>
Date: Tue, 7 Jan 2025 20:57:04 -0600
Subject: [PATCH 026/115] [Misc] sort torch profiler table by kernel timing
 (#11813)

---
 benchmarks/benchmark_latency.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py
index 0a14aedd5feba..e669ce4db299d 100644
--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
@@ -52,7 +52,7 @@ def run_to_completion(profile_dir: Optional[str] = None):
                 llm.generate(dummy_prompts,
                              sampling_params=sampling_params,
                              use_tqdm=False)
-            print(p.key_averages())
+            print(p.key_averages().table(sort_by="self_cuda_time_total"))
         else:
             start_time = time.perf_counter()
             llm.generate(dummy_prompts,

From dc71af0a71f347badcd917810440fad136e73ba6 Mon Sep 17 00:00:00 2001
From: WangErXiao <863579016@qq.com>
Date: Wed, 8 Jan 2025 12:09:25 +0800
Subject: [PATCH 027/115] =?UTF-8?q?Remove=20the=20duplicate=20imports=20of?=
 =?UTF-8?q?=20MultiModalKwargs=20and=20PlaceholderRange=E2=80=A6=20(#11824?=
 =?UTF-8?q?)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 vllm/v1/core/scheduler.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py
index baaf3329dc79f..b26716f5c02e6 100644
--- a/vllm/v1/core/scheduler.py
+++ b/vllm/v1/core/scheduler.py
@@ -5,8 +5,6 @@
 
 from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig
 from vllm.logger import init_logger
-from vllm.multimodal import MultiModalKwargs
-from vllm.multimodal.base import PlaceholderRange
 from vllm.sampling_params import SamplingParams
 from vllm.v1.core.encoder_cache_manager import EncoderCacheManager
 from vllm.v1.core.kv_cache_manager import KVCacheManager

From b640b19cc0babe256c5455befe95340f951763d9 Mon Sep 17 00:00:00 2001
From: Nishidha <nishidha.panpaliya@partner.ibm.com>
Date: Wed, 8 Jan 2025 10:35:37 +0530
Subject: [PATCH 028/115] Fixed docker build for ppc64le (#11518)

Signed-off-by: Nishidha Panpaliya <nishidha.panpaliya@partner.ibm.com>
---
 Dockerfile.ppc64le | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/Dockerfile.ppc64le b/Dockerfile.ppc64le
index 971248577983f..d3cd1c7b313bc 100644
--- a/Dockerfile.ppc64le
+++ b/Dockerfile.ppc64le
@@ -4,7 +4,7 @@ USER root
 
 ENV PATH="/usr/local/cargo/bin:$PATH:/opt/conda/bin/"
 
-RUN apt-get update -y && apt-get install -y git wget curl vim libnuma-dev libsndfile-dev libprotobuf-dev build-essential ffmpeg libsm6 libxext6 libgl1 
+RUN apt-get update -y && apt-get install -y git wget curl vim libnuma-dev libsndfile-dev libprotobuf-dev build-essential ffmpeg libsm6 libxext6 libgl1 libssl-dev 
 
 # Some packages in requirements-cpu are installed here
 # IBM provides optimized packages for ppc64le processors in the open-ce project for mamba
@@ -18,9 +18,8 @@ ARG GIT_REPO_CHECK=0
 RUN --mount=type=bind,source=.git,target=.git \
     if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh; fi
 
-# These packages will be in rocketce eventually
 RUN --mount=type=cache,target=/root/.cache/pip  \
-    pip install -v --prefer-binary --extra-index-url https://repo.fury.io/mgiessing \
+    RUSTFLAGS='-L /opt/conda/lib' pip install -v --prefer-binary --extra-index-url https://repo.fury.io/mgiessing \
         'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \
         torch==2.3.1 \
         -r requirements-cpu.txt \

From f4923cb8bce7d9c3038ad6c597ae1ff3ed90fe93 Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov <ilya.lavrenov@intel.com>
Date: Wed, 8 Jan 2025 09:08:30 +0400
Subject: [PATCH 029/115] [OpenVINO] Fixed Docker.openvino build (#11732)

Signed-off-by: Ilya Lavrenov <ilya.lavrenov@intel.com>
---
 Dockerfile.openvino | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Dockerfile.openvino b/Dockerfile.openvino
index 8bd188ffde408..32bcbfa9cc168 100644
--- a/Dockerfile.openvino
+++ b/Dockerfile.openvino
@@ -14,6 +14,7 @@ ARG GIT_REPO_CHECK=0
 RUN --mount=type=bind,source=.git,target=.git \
     if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
 
+RUN python3 -m pip install -U pip
 # install build requirements
 RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/requirements-build.txt
 # build vLLM with OpenVINO backend

From f645eb69545672d394e9e9e0ce46c725504fd2a0 Mon Sep 17 00:00:00 2001
From: Jee Jee Li <pandaleefree@gmail.com>
Date: Wed, 8 Jan 2025 13:08:48 +0800
Subject: [PATCH 030/115] [Bugfix] Add checks for LoRA and CPU offload (#11810)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
---
 vllm/config.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/vllm/config.py b/vllm/config.py
index 8b824a1fca511..a9b6d6b19127f 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -2051,6 +2051,11 @@ def __post_init__(self):
                 f"max_cpu_loras ({self.max_cpu_loras}) must be >= "
                 f"max_loras ({self.max_loras})")
 
+    def verify_with_cache_config(self, cache_config: CacheConfig):
+        # TODO: Support CPU offload with LoRA.
+        if cache_config.cpu_offload_gb > 0:
+            raise ValueError("CPU offload is not supported with LoRA yet.")
+
     def verify_with_model_config(self, model_config: ModelConfig):
         if self.lora_dtype in (None, "auto"):
             self.lora_dtype = model_config.dtype
@@ -3138,6 +3143,7 @@ def __post_init__(self):
             self.cache_config.verify_with_parallel_config(self.parallel_config)
 
         if self.lora_config:
+            self.lora_config.verify_with_cache_config(self.cache_config)
             self.lora_config.verify_with_model_config(self.model_config)
             self.lora_config.verify_with_scheduler_config(
                 self.scheduler_config)

From 259abd8953a8fea9abf3c4e66aa7c51391fa5b64 Mon Sep 17 00:00:00 2001
From: Simon Mo <simon.mo@hey.com>
Date: Tue, 7 Jan 2025 21:16:08 -0800
Subject: [PATCH 031/115] [Docs] reorganize sponsorship page (#11639)

Signed-off-by: simon-mo <simon.mo@hey.com>
---
 README.md                         | 15 ++++++++++-----
 docs/source/community/sponsors.md | 14 ++++++++++----
 2 files changed, 20 insertions(+), 9 deletions(-)

diff --git a/README.md b/README.md
index 652268ec29cac..8e85b460363fc 100644
--- a/README.md
+++ b/README.md
@@ -90,28 +90,33 @@ vLLM is a community project. Our compute resources for development and testing a
 
 <!-- Note: Please sort them in alphabetical order. -->
 <!-- Note: Please keep these consistent with docs/source/community/sponsors.md -->
-
+Cash Donations:
 - a16z
+- Dropbox
+- Sequoia Capital
+- Skywork AI
+- ZhenFund
+
+Compute Resources:
 - AMD
 - Anyscale
 - AWS
 - Crusoe Cloud
 - Databricks
 - DeepInfra
-- Dropbox
 - Google Cloud
 - Lambda Lab
 - Nebius
+- Novita
 - NVIDIA
 - Replicate
 - Roblox
 - RunPod
-- Sequoia Capital
-- Skywork AI
 - Trainy
 - UC Berkeley
 - UC San Diego
-- ZhenFund
+
+Slack Sponsor: Anyscale
 
 We also have an official fundraising venue through [OpenCollective](https://opencollective.com/vllm). We plan to use the fund to support the development, maintenance, and adoption of vLLM.
 
diff --git a/docs/source/community/sponsors.md b/docs/source/community/sponsors.md
index c6f83b3a92ca0..3d5a57baefbde 100644
--- a/docs/source/community/sponsors.md
+++ b/docs/source/community/sponsors.md
@@ -5,26 +5,32 @@ vLLM is a community project. Our compute resources for development and testing a
 <!-- Note: Please sort them in alphabetical order. -->
 <!-- Note: Please keep these consistent with README.md. -->
 
+Cash Donations:
 - a16z
+- Dropbox
+- Sequoia Capital
+- Skywork AI
+- ZhenFund
+
+Compute Resources:
 - AMD
 - Anyscale
 - AWS
 - Crusoe Cloud
 - Databricks
 - DeepInfra
-- Dropbox
 - Google Cloud
 - Lambda Lab
 - Nebius
+- Novita
 - NVIDIA
 - Replicate
 - Roblox
 - RunPod
-- Sequoia Capital
-- Skywork AI
 - Trainy
 - UC Berkeley
 - UC San Diego
-- ZhenFund
+
+Slack Sponsor: Anyscale
 
 We also have an official fundraising venue through [OpenCollective](https://opencollective.com/vllm). We plan to use the fund to support the development, maintenance, and adoption of vLLM.

From ef68eb28d8d45be6e0defe82245e16be9362e375 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Wed, 8 Jan 2025 13:40:09 +0800
Subject: [PATCH 032/115] [Bug] Fix pickling of `ModelConfig` when RunAI Model
 Streamer is used (#11825)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 vllm/config.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/vllm/config.py b/vllm/config.py
index a9b6d6b19127f..44426489f686a 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -381,16 +381,16 @@ def maybe_pull_model_tokenizer_for_s3(self, model: str,
         """
         if is_s3(model) or is_s3(tokenizer):
             if is_s3(model):
-                self.s3_model = S3Model()
-                self.s3_model.pull_files(model, allow_pattern=["*config.json"])
+                s3_model = S3Model()
+                s3_model.pull_files(model, allow_pattern=["*config.json"])
                 self.model_weights = self.model
-                self.model = self.s3_model.dir
+                self.model = s3_model.dir
 
             if is_s3(tokenizer):
-                self.s3_tokenizer = S3Model()
-                self.s3_tokenizer.pull_files(
+                s3_tokenizer = S3Model()
+                s3_tokenizer.pull_files(
                     model, ignore_pattern=["*.pt", "*.safetensors", "*.bin"])
-                self.tokenizer = self.s3_tokenizer.dir
+                self.tokenizer = s3_tokenizer.dir
 
     def _init_multimodal_config(
         self, limit_mm_per_prompt: Optional[Mapping[str, int]]

From 889e662eae19fe8f30469883c6854ee4df4315a9 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Wed, 8 Jan 2025 14:36:03 +0800
Subject: [PATCH 033/115] [misc] improve memory profiling (#11809)

Signed-off-by: youkaichao <youkaichao@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
---
 tests/test_utils.py                           | 19 +++++-
 .../vllm_test_utils/__init__.py               |  3 +-
 .../vllm_test_utils/monitor.py                | 68 +++++++++++++++++++
 vllm/utils.py                                 | 12 ++--
 4 files changed, 94 insertions(+), 8 deletions(-)
 create mode 100644 tests/vllm_test_utils/vllm_test_utils/monitor.py

diff --git a/tests/test_utils.py b/tests/test_utils.py
index 32a6b0aed66aa..0285b00d73be1 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -5,6 +5,7 @@
 
 import pytest
 import torch
+from vllm_test_utils import monitor
 
 from vllm.utils import (FlexibleArgumentParser, StoreBoolean, deprecate_kwargs,
                         get_open_port, memory_profiling, merge_async_iterators,
@@ -289,8 +290,16 @@ def test_memory_profiling():
 
     weights_memory_in_bytes = 128 * 1024 * 1024 * 4 # 512 MiB
 
+    def measure_current_non_torch():
+        free, total = torch.cuda.mem_get_info()
+        current_used = total - free
+        current_torch = torch.cuda.memory_reserved()
+        current_non_torch = current_used - current_torch
+        return current_non_torch
+
     with memory_profiling(baseline_memory_in_bytes=baseline_memory_in_bytes,
-    weights_memory_in_bytes=weights_memory_in_bytes) as result:
+    weights_memory_in_bytes=weights_memory_in_bytes) as result, \
+        monitor(measure_current_non_torch) as monitored_values:
         # make a memory spike, 1 GiB
         spike = torch.randn(256, 1024, 1024, device='cuda', dtype=torch.float32)
         del spike
@@ -298,7 +307,15 @@ def test_memory_profiling():
         # Add some extra non-torch memory 256 MiB (simulate NCCL)
         handle2 = lib.cudaMalloc(256 * 1024 * 1024)
 
+    # this is an analytic value, it is exact,
+    # we only have 256 MiB non-torch memory increase
+    measured_diff = monitored_values.values[-1] - monitored_values.values[0]
+    assert measured_diff == 256 * 1024 * 1024
+
     # Check that the memory usage is within 5% of the expected values
+    # 5% tolerance is caused by PyTorch caching allocator,
+    # we cannot control PyTorch's behavior of its internal buffers,
+    # which causes a small error (<10 MiB in practice)
     non_torch_ratio = result.non_torch_increase_in_bytes / (256 * 1024 * 1024) # noqa
     torch_peak_ratio = result.torch_peak_increase_in_bytes / (1024 * 1024 * 1024) # noqa
     assert abs(non_torch_ratio - 1) <= 0.05
diff --git a/tests/vllm_test_utils/vllm_test_utils/__init__.py b/tests/vllm_test_utils/vllm_test_utils/__init__.py
index bf0b62a5b75e3..6505c81546bb0 100644
--- a/tests/vllm_test_utils/vllm_test_utils/__init__.py
+++ b/tests/vllm_test_utils/vllm_test_utils/__init__.py
@@ -4,5 +4,6 @@
 """
 
 from .blame import BlameResult, blame
+from .monitor import MonitoredValues, monitor
 
-__all__ = ["blame", "BlameResult"]
+__all__ = ["blame", "BlameResult", "monitor", "MonitoredValues"]
diff --git a/tests/vllm_test_utils/vllm_test_utils/monitor.py b/tests/vllm_test_utils/vllm_test_utils/monitor.py
new file mode 100644
index 0000000000000..a237f53a75d18
--- /dev/null
+++ b/tests/vllm_test_utils/vllm_test_utils/monitor.py
@@ -0,0 +1,68 @@
+import contextlib
+import dataclasses
+import sys
+import traceback
+from typing import Callable, Generator, Generic, TypeVar
+
+_T = TypeVar("_T")
+
+
+@dataclasses.dataclass
+class MonitoredValues(Generic[_T]):
+    values: list[_T] = dataclasses.field(default_factory=list)
+    trace_stacks: list[str] = dataclasses.field(default_factory=list)
+
+
+@contextlib.contextmanager
+def monitor(
+    measure_func: Callable[[],
+                           _T]) -> Generator[MonitoredValues[_T], None, None]:
+    """
+    Trace the function calls to continuously monitor the change of
+    a value.
+
+    Usage:
+
+    ```python
+
+    def measure_func():
+        ... # measure the current value
+        return current_value
+
+    with monitor(measure_func) as monitored_values:
+        # do something
+    
+        monitored_values.values # all changes of the values
+        monitored_values.trace_stacks # trace stacks of every change
+    ```
+    """
+    monitored_values = MonitoredValues[_T]()
+
+    def _trace_calls(frame, event, arg=None):
+        nonlocal monitored_values
+        if event in ['line']:
+            # triggered by every line of Python code.
+            # only Python functions will trigger it,
+            # c/cpp functions will not trigger it.
+            try:
+                # Temporarily disable the trace function
+                sys.settrace(None)
+                # do a measurement
+                current_value = measure_func()
+                if len(monitored_values.values
+                       ) == 0 or current_value != monitored_values.values[-1]:
+                    monitored_values.values.append(current_value)
+                    monitored_values.trace_stacks.append("".join(
+                        traceback.format_stack()))
+                # Re-enable the trace function
+                sys.settrace(_trace_calls)
+            except NameError:
+                # modules are deleted during shutdown
+                pass
+        return _trace_calls
+
+    try:
+        sys.settrace(_trace_calls)
+        yield monitored_values
+    finally:
+        sys.settrace(None)
diff --git a/vllm/utils.py b/vllm/utils.py
index 63057153f851d..2660b53d7bfb0 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -1742,10 +1742,10 @@ class MemorySnapshot:
     timestamp: float = 0.0
 
     def measure(self):
-        self.torch_peak_in_bytes = torch.cuda.memory_stats(
-        )["allocated_bytes.all.peak"]
-        self.torch_memory_in_bytes = torch.cuda.memory_stats(
-        )["allocated_bytes.all.current"]
+        self.torch_peak_in_bytes = torch.cuda.max_memory_reserved()
+        # torch.cuda.memory_reserved() is how many bytes
+        # PyTorch gets from cuda (by calling cudaMalloc, etc.)
+        self.torch_memory_in_bytes = torch.cuda.memory_reserved()
         self.timestamp = time.time()
 
     def __sub__(self, other: "MemorySnapshot") -> "MemorySnapshot":
@@ -1822,10 +1822,10 @@ def memory_profiling(
 
     The memory used for loading weights (a.) is directly given from the argument `weights_memory_in_bytes`.
 
-    The increase of ``torch.cuda.memory_stats()["allocated_bytes.all.peak"]` after profiling gives (b.).
+    The increase of `torch.cuda.memory_stats()["allocated_bytes.all.peak"]` after profiling gives (b.).
 
     (c.) is tricky. We measure the total memory used in this GPU (`torch.cuda.mem_get_info()[1] - torch.cuda.mem_get_info()[0]`),
-    subtract the baseline memory, the memory used by the model weights, and diff of `torch.cuda.memory_stats()["allocated_bytes.all.current"]`.
+    subtract the baseline memory, the memory used by the model weights, and diff of `torch.cuda.memory_reserved()`.
     """ # noqa
     torch.cuda.reset_peak_memory_stats()
 

From ad9f1aa6796297a00456e715043f3eaad55bed53 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Wed, 8 Jan 2025 14:36:49 +0800
Subject: [PATCH 034/115] [doc] update wheels url (#11830)

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 docs/source/getting_started/installation/gpu-cuda.md | 4 ++--
 python_only_dev.py                                   | 2 +-
 setup.py                                             | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/docs/source/getting_started/installation/gpu-cuda.md b/docs/source/getting_started/installation/gpu-cuda.md
index 1cd513177bf0d..419b8163fc034 100644
--- a/docs/source/getting_started/installation/gpu-cuda.md
+++ b/docs/source/getting_started/installation/gpu-cuda.md
@@ -75,7 +75,7 @@ If you want to access the wheels for previous commits (e.g. to bisect the behavi
 
 ```console
 $ export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch
-$ pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
+$ pip install https://wheels.vllm.ai/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
 ```
 
 Note that the wheels are built with Python 3.8 ABI (see [PEP 425](https://peps.python.org/pep-0425/) for more details about ABI), so **they are compatible with Python 3.8 and later**. The version string in the wheel file name (`1.0.0.dev`) is just a placeholder to have a unified URL for the wheels, the actual versions of wheels are contained in the wheel metadata (the wheels listed in the extra index url have correct versions). Although we don't support Python 3.8 any more (because PyTorch 2.5 dropped support for Python 3.8), the wheels are still built with Python 3.8 ABI to keep the same wheel name as before.
@@ -126,7 +126,7 @@ $ cd vllm
 $ VLLM_USE_PRECOMPILED=1 pip install --editable .
 ```
 
-This will download the latest nightly wheel from https://vllm-wheels.s3.us-west-2.amazonaws.com/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl and use the compiled libraries from there in the installation.
+This will download the latest nightly wheel from https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl and use the compiled libraries from there in the installation.
 
 The `VLLM_PRECOMPILED_WHEEL_LOCATION` environment variable can be used instead of `VLLM_USE_PRECOMPILED` to specify a custom path or URL to the wheel file. For example, to use the [0.6.1.post1 PyPi wheel](https://pypi.org/project/vllm/#files):
 
diff --git a/python_only_dev.py b/python_only_dev.py
index f70b4984025b3..7d95ac96e6e4b 100644
--- a/python_only_dev.py
+++ b/python_only_dev.py
@@ -7,7 +7,7 @@
 or
 
 export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch
-export VLLM_PRECOMPILED_WHEEL_LOCATION=https://vllm-wheels.s3.us-west-2.amazonaws.com/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
+export VLLM_PRECOMPILED_WHEEL_LOCATION=https://wheels.vllm.ai/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
 pip install -e .
 """ # noqa
 
diff --git a/setup.py b/setup.py
index ba6953dbdc174..ef9f4e579e84d 100644
--- a/setup.py
+++ b/setup.py
@@ -252,7 +252,7 @@ def run(self):
 
 class repackage_wheel(build_ext):
     """Extracts libraries and other files from an existing wheel."""
-    default_wheel = "https://vllm-wheels.s3.us-west-2.amazonaws.com/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
+    default_wheel = "https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
 
     def run(self) -> None:
         wheel_location = os.getenv("VLLM_PRECOMPILED_WHEEL_LOCATION",

From a1b2b8606e75ab8fbc066e7f0fae20c1e60244ca Mon Sep 17 00:00:00 2001
From: Simon Mo <simon.mo@hey.com>
Date: Tue, 7 Jan 2025 23:05:46 -0800
Subject: [PATCH 035/115] [Docs] Update sponsor name: 'Novita' to 'Novita AI'
 (#11833)

---
 README.md                         | 2 +-
 docs/source/community/sponsors.md | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 8e85b460363fc..1f82229f39537 100644
--- a/README.md
+++ b/README.md
@@ -107,7 +107,7 @@ Compute Resources:
 - Google Cloud
 - Lambda Lab
 - Nebius
-- Novita
+- Novita AI
 - NVIDIA
 - Replicate
 - Roblox
diff --git a/docs/source/community/sponsors.md b/docs/source/community/sponsors.md
index 3d5a57baefbde..9d2af4c13b088 100644
--- a/docs/source/community/sponsors.md
+++ b/docs/source/community/sponsors.md
@@ -22,7 +22,7 @@ Compute Resources:
 - Google Cloud
 - Lambda Lab
 - Nebius
-- Novita
+- Novita AI
 - NVIDIA
 - Replicate
 - Roblox

From cfd3219f5881e2abea1f7c9d2866ed1838c5057b Mon Sep 17 00:00:00 2001
From: Wallas Henrique <wallashss@users.noreply.github.com>
Date: Wed, 8 Jan 2025 05:35:49 -0300
Subject: [PATCH 036/115] [Hardware][Apple] Native support for macOS Apple
 Silicon (#11696)

Signed-off-by: Wallas Santos <wallashss@ibm.com>
Co-authored-by: Michael Goin <michael@neuralmagic.com>
---
 cmake/cpu_extension.cmake                     | 61 ++++++++++++++-----
 csrc/cpu/cpu_types_arm.hpp                    | 61 ++++++++++++++++++-
 csrc/cpu/utils.cpp                            | 23 +++++--
 .../getting_started/installation/cpu-apple.md | 51 ++++++++++++++++
 .../getting_started/installation/cpu-arm.md   |  4 +-
 .../getting_started/installation/index.md     |  1 +
 requirements-cpu.txt                          |  6 +-
 setup.py                                      |  9 ++-
 vllm/config.py                                | 12 ++++
 vllm/entrypoints/openai/api_server.py         |  3 +
 vllm/utils.py                                 |  7 +++
 11 files changed, 209 insertions(+), 29 deletions(-)
 create mode 100644 docs/source/getting_started/installation/cpu-apple.md

diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake
index 68f7ca1af05ad..714abca2a5ff7 100644
--- a/cmake/cpu_extension.cmake
+++ b/cmake/cpu_extension.cmake
@@ -4,6 +4,11 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON)
 set(CMAKE_CXX_EXTENSIONS ON)
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 
+if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
+    set(MACOSX_FOUND TRUE)
+endif()
+
+
 #
 # Define environment variables for special configurations
 #
@@ -13,6 +18,9 @@ endif()
 
 include_directories("${CMAKE_SOURCE_DIR}/csrc")
 
+
+set (ENABLE_NUMA TRUE)
+
 #
 # Check the compile flags
 #
@@ -22,18 +30,28 @@ if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64")
         "-mf16c"
     )
 endif()
-list(APPEND CXX_COMPILE_FLAGS
-    "-fopenmp"
-    "-DVLLM_CPU_EXTENSION")
 
-execute_process(COMMAND cat /proc/cpuinfo
-                RESULT_VARIABLE CPUINFO_RET
-                OUTPUT_VARIABLE CPUINFO)
+if(MACOSX_FOUND)
+    list(APPEND CXX_COMPILE_FLAGS
+        "-Xpreprocessor"
+        "-fopenmp"
+        "-DVLLM_CPU_EXTENSION")
+else()
+    list(APPEND CXX_COMPILE_FLAGS
+        "-fopenmp"
+        "-DVLLM_CPU_EXTENSION")
+endif()
 
-if (NOT CPUINFO_RET EQUAL 0)
-    message(FATAL_ERROR "Failed to check CPU features via /proc/cpuinfo")
+if (NOT MACOSX_FOUND)
+    execute_process(COMMAND cat /proc/cpuinfo
+                    RESULT_VARIABLE CPUINFO_RET
+                    OUTPUT_VARIABLE CPUINFO)
+    if (NOT CPUINFO_RET EQUAL 0)
+        message(FATAL_ERROR "Failed to check CPU features via /proc/cpuinfo")
+    endif()
 endif()
 
+
 function (find_isa CPUINFO TARGET OUT)
     string(FIND ${CPUINFO} ${TARGET} ISA_FOUND)
     if(NOT ISA_FOUND EQUAL -1)
@@ -54,12 +72,17 @@ endfunction()
 
 is_avx512_disabled(AVX512_DISABLED)
 
-find_isa(${CPUINFO} "avx2" AVX2_FOUND)
-find_isa(${CPUINFO} "avx512f" AVX512_FOUND)
-find_isa(${CPUINFO} "POWER10" POWER10_FOUND)
-find_isa(${CPUINFO} "POWER9" POWER9_FOUND)
-find_isa(${CPUINFO} "asimd" ASIMD_FOUND) # Check for ARM NEON support
-find_isa(${CPUINFO} "bf16" ARM_BF16_FOUND) # Check for ARM BF16 support
+if (MACOSX_FOUND AND CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64")
+    set(APPLE_SILICON_FOUND TRUE)
+else()
+    find_isa(${CPUINFO} "avx2" AVX2_FOUND)
+    find_isa(${CPUINFO} "avx512f" AVX512_FOUND)
+    find_isa(${CPUINFO} "POWER10" POWER10_FOUND)
+    find_isa(${CPUINFO} "POWER9" POWER9_FOUND)
+    find_isa(${CPUINFO} "asimd" ASIMD_FOUND) # Check for ARM NEON support
+    find_isa(${CPUINFO} "bf16" ARM_BF16_FOUND) # Check for ARM BF16 support
+endif()
+
 
 if (AVX512_FOUND AND NOT AVX512_DISABLED)
     list(APPEND CXX_COMPILE_FLAGS
@@ -103,6 +126,9 @@ elseif (ASIMD_FOUND)
         set(MARCH_FLAGS "-march=armv8.2-a+dotprod+fp16")  
     endif()
     list(APPEND CXX_COMPILE_FLAGS ${MARCH_FLAGS})     
+elseif(APPLE_SILICON_FOUND)
+    message(STATUS "Apple Silicon Detected")
+    set(ENABLE_NUMA OFF)
 else()
     message(FATAL_ERROR "vLLM CPU backend requires AVX512, AVX2, Power9+ ISA or ARMv8 support.")
 endif()
@@ -139,7 +165,12 @@ endif()
 
 message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}")
 
-list(APPEND LIBS numa)
+if(ENABLE_NUMA)
+    list(APPEND LIBS numa)
+else()
+    message(STATUS "NUMA is disabled")
+    add_compile_definitions(VLLM_NUMA_DISABLED)
+endif()
 
 #
 # _C extension
diff --git a/csrc/cpu/cpu_types_arm.hpp b/csrc/cpu/cpu_types_arm.hpp
index 73e0f8cb2e0fb..ae062a5b86892 100644
--- a/csrc/cpu/cpu_types_arm.hpp
+++ b/csrc/cpu/cpu_types_arm.hpp
@@ -91,11 +91,68 @@ struct FP16Vec16 : public Vec<FP16Vec16> {
                 vst1q_f16(reinterpret_cast<__fp16*>(ptr) + 8, reg.val[1]);
             }
         }
+
+        // Note: below is the unrolled version of the following code:
+        // 
+        // for (int i = 0; i < remainder; ++i) {
+        //     reinterpret_cast<__fp16*>(ptr)[full_blocks * 8 + i] = 
+        //          vgetq_lane_f16(temp, i);
+        // }
+        // 
+        // For macOS build (Clang), the arm/neon intrinsics function 
+        // `vgetq_lane_f16` needs the parameter `i` to be constant at compile 
+        // time. 
         
         if (remainder > 0) {
             float16x8_t temp = reg.val[full_blocks];
-            for (int i = 0; i < remainder; ++i) {
-                reinterpret_cast<__fp16*>(ptr)[full_blocks * 8 + i] = vgetq_lane_f16(temp, i);
+            __fp16* fp16_ptr = reinterpret_cast<__fp16*>(ptr);
+            switch (remainder)
+            {
+            case 1:
+              fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0);
+              break;
+            case 2:
+              fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0);
+              fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1);
+              break;
+            case 3:
+              fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0);
+              fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1);
+              fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2);
+              break;
+            case 4:
+              fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0);
+              fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1);
+              fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2);
+              fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3);
+              break;
+            case 5:
+              fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0);
+              fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1);
+              fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2);
+              fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3);
+              fp16_ptr[full_blocks * 8 + 4] = vgetq_lane_f16(temp, 4);
+              break;
+            case 6:
+              fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0);
+              fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1);
+              fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2);
+              fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3);
+              fp16_ptr[full_blocks * 8 + 4] = vgetq_lane_f16(temp, 4);
+              fp16_ptr[full_blocks * 8 + 5] = vgetq_lane_f16(temp, 5);
+              break;
+            case 7:
+              fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0);
+              fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1);
+              fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2);
+              fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3);
+              fp16_ptr[full_blocks * 8 + 4] = vgetq_lane_f16(temp, 4);
+              fp16_ptr[full_blocks * 8 + 5] = vgetq_lane_f16(temp, 5);
+              fp16_ptr[full_blocks * 8 + 6] = vgetq_lane_f16(temp, 6);
+              break;
+            
+            default:
+              break;
             }
         }
     }
diff --git a/csrc/cpu/utils.cpp b/csrc/cpu/utils.cpp
index 1138a55df2f05..42a1c1d924bac 100644
--- a/csrc/cpu/utils.cpp
+++ b/csrc/cpu/utils.cpp
@@ -1,10 +1,22 @@
-#include <numa.h>
-#include <unistd.h>
-#include <string>
-#include <sched.h>
+#ifndef VLLM_NUMA_DISABLED
+  #include <numa.h>
+  #include <unistd.h>
+  #include <string>
+  #include <sched.h>
+#endif
 
 #include "cpu_types.hpp"
 
+#ifdef VLLM_NUMA_DISABLED
+std::string init_cpu_threads_env(const std::string& cpu_ids) {
+  return std::string(
+      "Warning: NUMA is not enabled in this build. `init_cpu_threads_env` has "
+      "no effect to setup thread affinity.");
+}
+
+#endif
+
+#ifndef VLLM_NUMA_DISABLED
 std::string init_cpu_threads_env(const std::string& cpu_ids) {
   bitmask* omp_cpu_mask = numa_parse_cpustring(cpu_ids.c_str());
   TORCH_CHECK(omp_cpu_mask->size > 0);
@@ -57,7 +69,7 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) {
   omp_lock_t writelock;
   omp_init_lock(&writelock);
 
-#pragma omp parallel for schedule(static, 1)
+  #pragma omp parallel for schedule(static, 1)
   for (size_t i = 0; i < omp_cpu_ids.size(); ++i) {
     cpu_set_t mask;
     CPU_ZERO(&mask);
@@ -88,3 +100,4 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) {
 
   return ss.str();
 }
+#endif
\ No newline at end of file
diff --git a/docs/source/getting_started/installation/cpu-apple.md b/docs/source/getting_started/installation/cpu-apple.md
new file mode 100644
index 0000000000000..b55e4384d064d
--- /dev/null
+++ b/docs/source/getting_started/installation/cpu-apple.md
@@ -0,0 +1,51 @@
+(installation-apple)=
+
+# Installation for macOS
+
+vLLM has experimental support for macOS with Apple Silicon. For now, users must build vLLM from source to run it natively on macOS. For more details, such as running vLLM in a Docker container, see the [ARM CPU documentation](#installation-arm).
+
+Currently the CPU implementation for macOS supports FP32 and FP16 datatypes.
+
+## Requirements
+
+- **Operating System**: `macOS Sonoma` or later
+- **SDK**: `Xcode 15.4` or later with Command Line Tools
+- **Compilers**: `Apple Clang >= 15.0.0`
+
+<!-- (arm-backend-quick-start-dockerfile)= -->
+
+## Build and installation
+
+After installing Xcode and the Command Line Tools, which include Apple Clang, run the following commands to build and install vLLM from source.
+
+```
+$ git clone https://github.com/vllm-project/vllm.git
+$ cd vllm
+$ pip install -r requirements-cpu.txt
+$ pip install -e . 
+```
+
+```{note}
+On macOS the `VLLM_TARGET_DEVICE` is automatically set to `cpu`, which currently is the only supported device.
+```
+
+
+
+## Troubleshooting
+
+If the build fails with errors like the following, where standard C++ headers cannot be found, try removing and reinstalling your
+[Command Line Tools for Xcode](https://developer.apple.com/download/all/).
+
+```
+[...] fatal error: 'map' file not found
+          1 | #include <map>
+            |          ^~~~~
+      1 error generated.
+      [2/8] Building CXX object CMakeFiles/_C.dir/csrc/cpu/pos_encoding.cpp.o
+
+[...] fatal error: 'cstddef' file not found
+         10 | #include <cstddef>
+            |          ^~~~~~~~~
+      1 error generated.
+```
+
diff --git a/docs/source/getting_started/installation/cpu-arm.md b/docs/source/getting_started/installation/cpu-arm.md
index a46e2c010600d..e199073ed721f 100644
--- a/docs/source/getting_started/installation/cpu-arm.md
+++ b/docs/source/getting_started/installation/cpu-arm.md
@@ -2,7 +2,7 @@
 
 # Installation for ARM CPUs
 
-vLLM has been adapted to work on ARM64 CPUs with NEON support, leveraging the CPU backend initially developed for the x86 platform. This guide provides installation instructions specific to ARM. For additional details on supported features, refer to the [x86 CPU documentation](#installation-x86) covering:
+vLLM has been adapted to work on ARM64 CPUs with NEON support, leveraging the CPU backend initially developed for the x86 platform. This guide provides installation instructions specific to ARM (which also apply to Apple Silicon, see [Installation for macOS](#installation-apple) for more). For additional details on supported features, refer to the [x86 CPU documentation](#installation-x86) covering:
 
 - CPU backend inference capabilities
 - Relevant runtime environment variables
@@ -20,7 +20,7 @@ Contents:
 ## Requirements
 
 - **Operating System**: Linux or macOS
-- **Compiler**: `gcc/g++ >= 12.3.0` (optional, but recommended)
+- **Compilers**: `gcc/g++ >= 12.3.0` (optional, but recommended) or `Apple Clang >= 15.0.0` for macOS
 - **Instruction Set Architecture (ISA)**: NEON support is required
 
 (arm-backend-quick-start-dockerfile)=
diff --git a/docs/source/getting_started/installation/index.md b/docs/source/getting_started/installation/index.md
index 83de1aff409b2..0ebadca2ccec9 100644
--- a/docs/source/getting_started/installation/index.md
+++ b/docs/source/getting_started/installation/index.md
@@ -11,6 +11,7 @@ gpu-cuda
 gpu-rocm
 cpu-x86
 cpu-arm
+cpu-apple
 hpu-gaudi
 tpu
 xpu
diff --git a/requirements-cpu.txt b/requirements-cpu.txt
index e62f313297762..056fbf5a7adec 100644
--- a/requirements-cpu.txt
+++ b/requirements-cpu.txt
@@ -2,7 +2,7 @@
 -r requirements-common.txt
 
 # Dependencies for CPUs
-torch==2.5.1+cpu; platform_machine != "ppc64le" and platform_machine != "aarch64" 
-torch==2.5.1; platform_machine == "aarch64"
+torch==2.5.1+cpu; platform_machine != "ppc64le" and platform_machine != "aarch64" and platform_system != "Darwin"
+torch==2.5.1; platform_machine == "aarch64" or platform_system == "Darwin" 
 torchvision; platform_machine != "ppc64le"  # required for the image processor of phi3v, this must be updated alongside torch
-datasets # for benchmark scripts
\ No newline at end of file
+datasets # for benchmark scripts
diff --git a/setup.py b/setup.py
index ef9f4e579e84d..b6c1f5bc8ac3f 100644
--- a/setup.py
+++ b/setup.py
@@ -34,9 +34,14 @@ def load_module_from_path(module_name, path):
 
 VLLM_TARGET_DEVICE = envs.VLLM_TARGET_DEVICE
 
-if not sys.platform.startswith("linux"):
+if sys.platform.startswith("darwin") and VLLM_TARGET_DEVICE != "cpu":
     logger.warning(
-        "vLLM only supports Linux platform (including WSL). "
+        "VLLM_TARGET_DEVICE automatically set to `cpu` due to macOS")
+    VLLM_TARGET_DEVICE = "cpu"
+elif not (sys.platform.startswith("linux")
+          or sys.platform.startswith("darwin")):
+    logger.warning(
+        "vLLM only supports Linux platform (including WSL) and MacOS. "
         "Building on %s, "
         "so vLLM may not be able to run correctly", sys.platform)
     VLLM_TARGET_DEVICE = "empty"
diff --git a/vllm/config.py b/vllm/config.py
index 44426489f686a..535cbe97a311a 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -4,6 +4,7 @@
 import hashlib
 import json
 import os
+import sys
 import warnings
 from contextlib import contextmanager
 from dataclasses import dataclass, field, replace
@@ -2259,6 +2260,17 @@ def _get_and_verify_dtype(
                     "supported for POWERPC.")
                 torch_dtype = torch.bfloat16
 
+            # TODO: change this condition to check whether the platform
+            # supports bf16 instead of checking the OS. For instance, M2
+            # already supports bf16, but we need to modify
+            # `cpu_extension.cmake` to activate the feature in the build.
+            if (current_platform.is_cpu() and sys.platform.startswith("darwin")
+                    and current_platform.get_cpu_architecture()
+                    == CpuArchEnum.ARM and config_dtype == torch.bfloat16):
+                logger.info("For macOS with Apple Silicon, currently bfloat16 "
+                            "is not supported. Setting dtype to float16.")
+                torch_dtype = torch.float16
+
             if current_platform.is_hpu() and config_dtype == torch.float16:
                 logger.info(
                     "For HPU, we cast models to bfloat16 instead of"
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index 047f699e4f277..bc1471e1f534d 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -7,6 +7,7 @@
 import re
 import signal
 import socket
+import sys
 import tempfile
 import uuid
 from argparse import Namespace
@@ -805,6 +806,8 @@ def signal_handler(*_) -> None:
             ssl_certfile=args.ssl_certfile,
             ssl_ca_certs=args.ssl_ca_certs,
             ssl_cert_reqs=args.ssl_cert_reqs,
+            # Workaround to work on macOS
+            fd=sock.fileno() if sys.platform.startswith("darwin") else None,
             **uvicorn_kwargs,
         )
 
diff --git a/vllm/utils.py b/vllm/utils.py
index 2660b53d7bfb0..c09cae70e9af8 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -524,6 +524,13 @@ def get_open_port() -> int:
 
 
 def find_process_using_port(port: int) -> Optional[psutil.Process]:
+    # TODO: We can not check for running processes with network
+    # port on macOS. Therefore, we can not have a full graceful shutdown
+    # of vLLM. For now, let's not look for processes in this case.
+    # Ref: https://www.florianreinhard.de/accessdenied-in-psutil/
+    if sys.platform.startswith("darwin"):
+        return None
+
     for conn in psutil.net_connections():
         if conn.laddr.port == port:
             try:

From f12141170a95ad866b3c55762623bc718994e1d7 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Wed, 8 Jan 2025 18:46:43 +0800
Subject: [PATCH 037/115] [torch.compile] consider relevant code in compilation
 cache (#11614)

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 vllm/compilation/backends.py   | 70 ++++++++++++++++++++++++++++++----
 vllm/compilation/decorators.py | 28 +++++++++++++-
 vllm/config.py                 | 29 ++------------
 vllm/sequence.py               |  7 ++++
 4 files changed, 99 insertions(+), 35 deletions(-)

diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py
index a8dd628b9cd6f..87655530cead4 100644
--- a/vllm/compilation/backends.py
+++ b/vllm/compilation/backends.py
@@ -145,6 +145,7 @@ def wrap_inductor(graph: fx.GraphModule,
                   example_inputs,
                   additional_inductor_config,
                   compilation_config: CompilationConfig,
+                  vllm_backend: "VllmBackend",
                   graph_index: int = 0,
                   num_graphs: int = 1,
                   runtime_shape: Optional[int] = None,
@@ -176,7 +177,7 @@ def wrap_inductor(graph: fx.GraphModule,
     # see https://github.com/pytorch/pytorch/issues/138980
     graph = copy.deepcopy(graph)
 
-    cache_data = compilation_config.inductor_hash_cache
+    cache_data = vllm_backend.inductor_hash_cache
     if (runtime_shape, graph_index) in cache_data:
         # we compiled this graph before
         # so we can directly lookup the compiled graph via hash
@@ -196,7 +197,7 @@ def wrap_inductor(graph: fx.GraphModule,
                 hash_str, example_inputs, True, False)
             assert inductor_compiled_graph is not None, (
                 "Inductor cache lookup failed. Please remove"
-                f"the cache file {compilation_config.inductor_hash_cache.cache_file_path} and try again."  # noqa
+                f"the cache file {cache_data.cache_file_path} and try again."  # noqa
             )
 
         # Inductor calling convention (function signature):
@@ -354,7 +355,7 @@ class PiecewiseCompileInterpreter(torch.fx.Interpreter):
 
     def __init__(self, module: torch.fx.GraphModule,
                  compile_submod_names: List[str], vllm_config: VllmConfig,
-                 graph_pool):
+                 graph_pool, vllm_backend: "VllmBackend"):
         super().__init__(module)
         from torch._guards import detect_fake_mode
         self.fake_mode = detect_fake_mode()
@@ -362,6 +363,7 @@ def __init__(self, module: torch.fx.GraphModule,
         self.compilation_config = vllm_config.compilation_config
         self.graph_pool = graph_pool
         self.vllm_config = vllm_config
+        self.vllm_backend = vllm_backend
 
     def run(self, *args):
         fake_args = [
@@ -389,6 +391,7 @@ def call_module(self, target: torch.fx.node.Target,
                 args,
                 self.compilation_config.inductor_compile_config,
                 self.compilation_config,
+                self.vllm_backend,
                 graph_index=index,
                 num_graphs=len(self.compile_submod_names),
                 runtime_shape=None,
@@ -397,7 +400,7 @@ def call_module(self, target: torch.fx.node.Target,
             self.module.__dict__[target] = PiecewiseBackend(
                 submod, self.vllm_config, self.graph_pool, index,
                 len(self.compile_submod_names), sym_shape_indices,
-                compiled_graph_for_general_shape)
+                compiled_graph_for_general_shape, self.vllm_backend)
 
             compilation_counter.num_piecewise_capturable_graphs_seen += 1
 
@@ -430,6 +433,7 @@ class VllmBackend:
     post_grad_passes: Sequence[Callable]
     sym_tensor_indices: List[int]
     input_buffers: List[torch.Tensor]
+    inductor_hash_cache: InductorHashCache
 
     def __init__(
         self,
@@ -472,6 +476,53 @@ def configure_post_pass(self):
 
     def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable:
 
+        if not self.compilation_config.cache_dir:
+            # no provided cache dir, generate one based on the known factors
+            # that affects the compilation. if none of the factors change,
+            # the cache dir will be the same so that we can reuse the compiled
+            # graph.
+
+            # 1. factors come from the vllm_config (it mainly summarizes how the
+            #    model is created)
+            vllm_config = self.vllm_config
+            config_hash = vllm_config.compute_hash()
+
+            # 2. factors come from the code files that are traced by Dynamo (
+            #    it mainly summarizes how the model is used in forward pass)
+            forward_code_files = list(
+                sorted(self.compilation_config.traced_files))
+            self.compilation_config.traced_files.clear()
+            logger.debug(
+                "Traced files (to be considered for compilation cache):\n%s",
+                "\n".join(forward_code_files))
+            hash_content = []
+            for filepath in forward_code_files:
+                hash_content.append(filepath)
+                with open(filepath) as f:
+                    hash_content.append(f.read())
+            import hashlib
+            code_hash = hashlib.md5(
+                "\n".join(hash_content).encode()).hexdigest()
+
+            # combine the two hashes to generate the cache dir
+            hash_key = hashlib.md5(
+                f"{config_hash}_{code_hash}".encode()).hexdigest()[:10]
+            cache_dir = os.path.join(
+                envs.VLLM_CACHE_ROOT, "torch_compile_cache", hash_key,
+                f"rank_{vllm_config.parallel_config.rank}")
+        else:
+            cache_dir = self.compilation_config.cache_dir
+        os.makedirs(cache_dir, exist_ok=True)
+
+        disabled = envs.VLLM_DISABLE_COMPILE_CACHE
+        self.inductor_hash_cache: InductorHashCache = InductorHashCache(
+            cache_dir, disabled=disabled)
+        if disabled:
+            logger.info("vLLM's torch.compile cache is disabled.")
+        else:
+            logger.info("Using cache directory: %s for vLLM's torch.compile",
+                        cache_dir)
+
         # when dynamo calls the backend, it means the bytecode
         # transform and analysis are done
         compilation_counter.num_graphs_seen += 1
@@ -507,8 +558,8 @@ def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable:
         # propagate the split graph to the piecewise backend,
         # compile submodules with symbolic shapes
         PiecewiseCompileInterpreter(self.split_gm, submod_names_to_compile,
-                                    self.vllm_config,
-                                    self.graph_pool).run(*example_inputs)
+                                    self.vllm_config, self.graph_pool,
+                                    self).run(*example_inputs)
 
         self._called = True
 
@@ -577,7 +628,8 @@ class PiecewiseBackend:
     def __init__(self, graph: fx.GraphModule, vllm_config: VllmConfig,
                  graph_pool: Any, piecewise_compile_index: int,
                  total_piecewise_compiles: int, sym_shape_indices: List[int],
-                 compiled_graph_for_general_shape: Callable):
+                 compiled_graph_for_general_shape: Callable,
+                 vllm_backend: VllmBackend):
         """
         The backend for piecewise compilation.
         It mainly handles the compilation and cudagraph capturing.
@@ -597,6 +649,7 @@ def __init__(self, graph: fx.GraphModule, vllm_config: VllmConfig,
         self.graph_pool = graph_pool
         self.piecewise_compile_index = piecewise_compile_index
         self.total_piecewise_compiles = total_piecewise_compiles
+        self.vllm_backend = vllm_backend
 
         self.is_first_graph = piecewise_compile_index == 0
         self.is_last_graph = (
@@ -634,7 +687,7 @@ def check_for_ending_compilation(self):
         if self.is_last_graph and not self.to_be_compiled_sizes:
             # no specific sizes to compile
             # save the hash of the inductor graph for the next run
-            self.compilation_config.inductor_hash_cache.save_to_file()
+            self.vllm_backend.inductor_hash_cache.save_to_file()
             end_monitoring_torch_compile(self.vllm_config)
 
     def __call__(self, *args) -> Any:
@@ -662,6 +715,7 @@ def __call__(self, *args) -> Any:
                 args,
                 self.compilation_config.inductor_compile_config,
                 self.compilation_config,
+                self.vllm_backend,
                 graph_index=self.piecewise_compile_index,
                 num_graphs=self.total_piecewise_compiles,
                 runtime_shape=runtime_shape,
diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py
index 805a217ee6ca1..10513111ea7f1 100644
--- a/vllm/compilation/decorators.py
+++ b/vllm/compilation/decorators.py
@@ -1,8 +1,10 @@
 import inspect
 from typing import Callable, Dict, List, Optional, TypeVar, Union, overload
+from unittest.mock import patch
 
 import torch
 import torch.nn as nn
+from torch._dynamo.symbolic_convert import InliningInstructionTranslator
 
 from vllm.compilation.counter import compilation_counter
 from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispatcher
@@ -196,7 +198,31 @@ def __call__(self, *args, **kwargs):
             # we need to control all the compilation of the model.
             torch._dynamo.eval_frame.remove_from_cache(
                 self.original_code_object)
-            return self.compiled_callable(*args, **kwargs)
+
+            # collect all relevant files traced by Dynamo,
+            # so that the compilation cache can trigger re-compilation
+            # properly when any of these files change.
+
+            # 1. the file containing the top-level forward function
+            self.vllm_config.compilation_config.traced_files.add(
+                self.original_code_object.co_filename)
+
+            # 2. every time Dynamo sees a function call, it will inline
+            # the function by calling InliningInstructionTranslator.inline_call
+            # we hijack this function to know all the functions called
+            # during Dynamo tracing, and their corresponding files
+            inline_call = InliningInstructionTranslator.inline_call
+
+            def patched_inline_call(parent, func, args, kwargs):
+                code = func.get_code()
+                self.vllm_config.compilation_config.traced_files.add(
+                    code.co_filename)
+                return inline_call(parent, func, args, kwargs)
+
+            with patch.object(InliningInstructionTranslator, 'inline_call',
+                              patched_inline_call):
+                output = self.compiled_callable(*args, **kwargs)
+            return output
 
         # usually, capturing the model once is enough, and then we can
         # dispatch to the compiled code directly, without going through
diff --git a/vllm/config.py b/vllm/config.py
index 535cbe97a311a..6dabeb3861af2 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -3,7 +3,6 @@
 import enum
 import hashlib
 import json
-import os
 import sys
 import warnings
 from contextlib import contextmanager
@@ -2778,9 +2777,8 @@ def model_post_init(self, __context: Any) -> None:
     # keep track of enabled and disabled custom ops
     enabled_custom_ops: Counter[str] = PrivateAttr
     disabled_custom_ops: Counter[str] = PrivateAttr
+    traced_files: Set[str] = PrivateAttr
     compilation_time: float = PrivateAttr
-    # should be InductorHashCache, but Pydantic does not support it
-    inductor_hash_cache: Any = PrivateAttr
 
     # Per-model forward context
     # Mainly used to store attention cls
@@ -2818,6 +2816,7 @@ def __repr__(self) -> str:
             "compilation_time",
             "bs_to_padded_graph_size",
             "pass_config",
+            "traced_files",
         }
         return self.model_dump_json(exclude=exclude, exclude_unset=True)
 
@@ -2877,6 +2876,7 @@ def model_post_init(self, __context: Any) -> None:
 
         self.enabled_custom_ops = Counter()
         self.disabled_custom_ops = Counter()
+        self.traced_files = set()
         self.static_forward_context = {}
         self.compilation_time = 0.0
 
@@ -2899,29 +2899,6 @@ def init_backend(self, vllm_config: "VllmConfig") -> Union[str, Callable]:
         # merge with the config use_inductor
         assert self.level == CompilationLevel.PIECEWISE
 
-        if not self.cache_dir:
-            # no provided cache dir, generate one based on the known factors
-            # that affects the compilation. if none of the factors change,
-            # the cache dir will be the same so that we can reuse the compiled
-            # graph.
-            hash_key = vllm_config.compute_hash()
-            cache_dir = os.path.join(
-                envs.VLLM_CACHE_ROOT, "torch_compile_cache", hash_key,
-                f"rank_{vllm_config.parallel_config.rank}")
-            os.makedirs(cache_dir, exist_ok=True)
-            self.cache_dir = cache_dir
-
-            disabled = envs.VLLM_DISABLE_COMPILE_CACHE
-            from vllm.compilation.backends import InductorHashCache
-            self.inductor_hash_cache: InductorHashCache = InductorHashCache(
-                self.cache_dir, disabled=disabled)
-            if disabled:
-                logger.info("vLLM's torch.compile cache is disabled.")
-            else:
-                logger.info(
-                    "Using cache directory: %s for vLLM's torch.compile",
-                    self.cache_dir)
-
         from vllm.compilation.backends import VllmBackend
         return VllmBackend(vllm_config)
 
diff --git a/vllm/sequence.py b/vllm/sequence.py
index 0157abbd2eed5..5857f656dfc10 100644
--- a/vllm/sequence.py
+++ b/vllm/sequence.py
@@ -1108,6 +1108,13 @@ class IntermediateTensors:
 
     tensors: Dict[str, torch.Tensor]
 
+    def __init__(self, tensors):
+        # manually define this function, so that
+        # Dynamo knows `IntermediateTensors()` comes from this file.
+        # Otherwise, dataclass will generate this function by evaluating
+        # a string, and we will lose the information about the source file.
+        self.tensors = tensors
+
     def __getitem__(self, key: Union[str, slice]):
         if isinstance(key, str):
             return self.tensors[key]

From 2a0596bc480bb835dc05a30f5e708ecbfffbcd69 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Wed, 8 Jan 2025 18:59:58 +0800
Subject: [PATCH 038/115] [VLM] Reorganize profiling/processing-related code
 (#11812)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 .../processing/test_llava_next.py             |  41 ++--
 .../processing/test_llava_onevision.py        |  41 ++--
 .../vision_language/processing/test_phi3v.py  |  24 +-
 .../processing/test_qwen2_vl.py               |  22 +-
 tests/multimodal/test_processing.py           |  52 ++---
 .../vllm_add_dummy_model/my_llava.py          |  10 +-
 vllm/inputs/preprocess.py                     |   2 +-
 vllm/inputs/registry.py                       |   4 +-
 vllm/model_executor/models/aria.py            |  47 ++--
 vllm/model_executor/models/blip2.py           |  39 ++--
 vllm/model_executor/models/chameleon.py       |  47 ++--
 vllm/model_executor/models/fuyu.py            |  80 +++----
 vllm/model_executor/models/llava.py           | 175 +++++++-------
 vllm/model_executor/models/llava_next.py      |  55 +++--
 .../model_executor/models/llava_next_video.py | 104 +++++----
 vllm/model_executor/models/llava_onevision.py | 115 +++++----
 vllm/model_executor/models/phi3v.py           |  83 +++----
 vllm/model_executor/models/qwen2_audio.py     |  49 ++--
 vllm/model_executor/models/qwen2_vl.py        | 113 +++++----
 vllm/model_executor/models/ultravox.py        |  46 ++--
 vllm/multimodal/processing.py                 | 219 +++++++-----------
 vllm/multimodal/profiling.py                  | 152 +++++++++---
 vllm/multimodal/registry.py                   |  73 +++++-
 23 files changed, 833 insertions(+), 760 deletions(-)

diff --git a/tests/models/decoder_only/vision_language/processing/test_llava_next.py b/tests/models/decoder_only/vision_language/processing/test_llava_next.py
index 9fa6a8a10a0f9..689d17be81889 100644
--- a/tests/models/decoder_only/vision_language/processing/test_llava_next.py
+++ b/tests/models/decoder_only/vision_language/processing/test_llava_next.py
@@ -4,24 +4,17 @@
 import pytest
 from PIL import Image
 from pqdm.threads import pqdm
-from transformers import AutoTokenizer
 
-from vllm.inputs import InputProcessingContext
+from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.parse import ImageSize
+from vllm.multimodal.processing import BaseMultiModalProcessor
+from vllm.multimodal.utils import cached_get_tokenizer
 
 from ....utils import build_model_context
 
 
-# Fixtures lazy import to avoid initializing CUDA during test collection
-@pytest.fixture()
-def processor_for_llava_next():
-    from vllm.model_executor.models.llava_next import (
-        LlavaNextMultiModalProcessor)
-    return LlavaNextMultiModalProcessor
-
-
 def _validate_image_prompt_replacements_one(
-    processor,
+    processor: BaseMultiModalProcessor,
     num_imgs: int,
     failed_size_excs: list[tuple[ImageSize, Exception]],
     image_size: ImageSize,
@@ -78,20 +71,17 @@ def _test_image_prompt_replacements(
 
 @pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
 @pytest.mark.parametrize("num_imgs", [1, 2])
-def test_processor_prompt_replacements_regression(
-    processor_for_llava_next,
-    model_id: str,
-    num_imgs: int,
-):
+def test_processor_prompt_replacements_regression(model_id, num_imgs):
     ctx = build_model_context(
         model_name=model_id,
         tokenizer_name=model_id,
         mm_processor_kwargs=None,
         limit_mm_per_prompt={"image": num_imgs},
     )
-    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
-    ctx = InputProcessingContext(ctx.model_config, tokenizer)
-    processor = processor_for_llava_next(ctx)
+    processor = MULTIMODAL_REGISTRY.create_processor(
+        ctx.model_config,
+        tokenizer=cached_get_tokenizer(ctx.model_config.tokenizer),
+    )
 
     image_ratios = [(171, 152), (184, 161), (198, 176), (333, 296), (369, 328),
                     (488, 183), (2560, 1669)]
@@ -111,20 +101,17 @@ def test_processor_prompt_replacements_regression(
                   "Comment this out to run it manually.")
 @pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
 @pytest.mark.parametrize("num_imgs", [1])
-def test_processor_prompt_replacements_all(
-    processor_for_llava_next,
-    model_id: str,
-    num_imgs: int,
-):
+def test_processor_prompt_replacements_all(model_id, num_imgs):
     ctx = build_model_context(
         model_name=model_id,
         tokenizer_name=model_id,
         mm_processor_kwargs=None,
         limit_mm_per_prompt={"image": num_imgs},
     )
-    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
-    ctx = InputProcessingContext(ctx.model_config, tokenizer)
-    processor = processor_for_llava_next(ctx)
+    processor = MULTIMODAL_REGISTRY.create_processor(
+        ctx.model_config,
+        tokenizer=cached_get_tokenizer(ctx.model_config.tokenizer),
+    )
 
     seen_aspect_ratios = set[float]()
     image_sizes = list[ImageSize]()
diff --git a/tests/models/decoder_only/vision_language/processing/test_llava_onevision.py b/tests/models/decoder_only/vision_language/processing/test_llava_onevision.py
index d4cdffa210b6d..a033354f0e9b8 100644
--- a/tests/models/decoder_only/vision_language/processing/test_llava_onevision.py
+++ b/tests/models/decoder_only/vision_language/processing/test_llava_onevision.py
@@ -4,24 +4,17 @@
 import pytest
 from PIL import Image
 from pqdm.threads import pqdm
-from transformers import AutoTokenizer
 
-from vllm.inputs import InputProcessingContext
+from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.parse import ImageSize
+from vllm.multimodal.processing import BaseMultiModalProcessor
+from vllm.multimodal.utils import cached_get_tokenizer
 
 from ....utils import build_model_context
 
 
-# Fixtures lazy import to avoid initializing CUDA during test collection
-@pytest.fixture()
-def processor_for_llava_onevision():
-    from vllm.model_executor.models.llava_onevision import (
-        LlavaOnevisionMultiModalProcessor)
-    return LlavaOnevisionMultiModalProcessor
-
-
 def _validate_image_prompt_replacements_one(
-    processor,
+    processor: BaseMultiModalProcessor,
     num_imgs: int,
     failed_size_excs: list[tuple[ImageSize, Exception]],
     image_size: ImageSize,
@@ -77,20 +70,17 @@ def _test_image_prompt_replacements(
 @pytest.mark.parametrize("model_id",
                          ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
 @pytest.mark.parametrize("num_imgs", [1, 2])
-def test_processor_prompt_replacements_regression(
-    processor_for_llava_onevision,
-    model_id: str,
-    num_imgs: int,
-):
+def test_processor_prompt_replacements_regression(model_id, num_imgs):
     ctx = build_model_context(
         model_name=model_id,
         tokenizer_name=model_id,
         mm_processor_kwargs=None,
         limit_mm_per_prompt={"image": num_imgs},
     )
-    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
-    ctx = InputProcessingContext(ctx.model_config, tokenizer)
-    processor = processor_for_llava_onevision(ctx)
+    processor = MULTIMODAL_REGISTRY.create_processor(
+        ctx.model_config,
+        tokenizer=cached_get_tokenizer(ctx.model_config.tokenizer),
+    )
 
     image_ratios = [(171, 152), (184, 161), (198, 176), (333, 296), (369, 328),
                     (488, 183), (2560, 1669)]
@@ -111,20 +101,17 @@ def test_processor_prompt_replacements_regression(
 @pytest.mark.parametrize("model_id",
                          ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
 @pytest.mark.parametrize("num_imgs", [1])
-def test_processor_prompt_replacements_all(
-    processor_for_llava_onevision,
-    model_id: str,
-    num_imgs: int,
-):
+def test_processor_prompt_replacements_all(model_id, num_imgs):
     ctx = build_model_context(
         model_name=model_id,
         tokenizer_name=model_id,
         mm_processor_kwargs=None,
         limit_mm_per_prompt={"image": num_imgs},
     )
-    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
-    ctx = InputProcessingContext(ctx.model_config, tokenizer)
-    processor = processor_for_llava_onevision(ctx)
+    processor = MULTIMODAL_REGISTRY.create_processor(
+        ctx.model_config,
+        tokenizer=cached_get_tokenizer(ctx.model_config.tokenizer),
+    )
 
     seen_aspect_ratios = set[float]()
     image_sizes = list[ImageSize]()
diff --git a/tests/models/decoder_only/vision_language/processing/test_phi3v.py b/tests/models/decoder_only/vision_language/processing/test_phi3v.py
index 249045b3c04ce..c5b77260c6544 100644
--- a/tests/models/decoder_only/vision_language/processing/test_phi3v.py
+++ b/tests/models/decoder_only/vision_language/processing/test_phi3v.py
@@ -1,21 +1,13 @@
 """Tests for phi3v's multimodal preprocessing kwargs."""
 import pytest
-from transformers import AutoTokenizer
 
-from vllm.inputs import InputProcessingContext
-from vllm.model_executor.models.phi3v import _IMAGE_TOKEN_ID
+from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.utils import cached_get_tokenizer
 
 from .....conftest import _ImageAssets
 from ....utils import build_model_context
 
 
-# Wrap lazy imports to avoid initializing CUDA during test collection
-@pytest.fixture()
-def processor_for_phi3v():
-    from vllm.model_executor.models.phi3v import Phi3VMultiModalProcessor
-    return Phi3VMultiModalProcessor
-
-
 @pytest.mark.parametrize("model_id", ["microsoft/Phi-3.5-vision-instruct"])
 # yapf: disable
 @pytest.mark.parametrize(
@@ -29,7 +21,6 @@ def processor_for_phi3v():
 # yapf: enable
 @pytest.mark.parametrize("num_imgs", [1, 2])
 def test_processor_override(
-    processor_for_phi3v,
     image_assets: _ImageAssets,
     model_id: str,
     mm_processor_kwargs: dict[str, int],
@@ -37,21 +28,26 @@ def test_processor_override(
     num_imgs: int,
 ):
     """Ensure input_processor_for_phi3v handles num_crops properly."""
+    # Avoid initializing CUDA early
+    from vllm.model_executor.models.phi3v import _IMAGE_TOKEN_ID
+
     ctx = build_model_context(
         model_name=model_id,
         tokenizer_name=model_id,
         trust_remote_code=True,
         limit_mm_per_prompt={"image": num_imgs},
     )
-    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
-    ctx = InputProcessingContext(ctx.model_config, tokenizer)
+    tokenizer = cached_get_tokenizer(ctx.model_config.tokenizer)
+    processor = MULTIMODAL_REGISTRY.create_processor(
+        ctx.model_config,
+        tokenizer=tokenizer,
+    )
 
     # Build the image str / prompt based on the number of images we pass
     img_str = "".join([f"<|image_{idx}|>\n" for idx in range(1, num_imgs + 1)])
     prompt = f"<|user|>\n{img_str}<|end|>\n<|assistant|>\n"
     mm_data = {"image": [image_assets[0].pil_image] * num_imgs}
 
-    processor = processor_for_phi3v(ctx)
     processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs)
 
     # Ensure we have the right number of placeholders per num_crops size
diff --git a/tests/models/decoder_only/vision_language/processing/test_qwen2_vl.py b/tests/models/decoder_only/vision_language/processing/test_qwen2_vl.py
index b9ac887edf90f..0d54802f2b733 100644
--- a/tests/models/decoder_only/vision_language/processing/test_qwen2_vl.py
+++ b/tests/models/decoder_only/vision_language/processing/test_qwen2_vl.py
@@ -1,19 +1,12 @@
 import pytest
-from transformers import AutoTokenizer
 
-from vllm.inputs import InputProcessingContext
+from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.utils import cached_get_tokenizer
 
 from .....conftest import _ImageAssets
 from ....utils import build_model_context
 
 
-# Fixtures lazy import to avoid initializing CUDA during test collection
-@pytest.fixture()
-def processor_for_qwen2_vl():
-    from vllm.model_executor.models.qwen2_vl import Qwen2VLMultiModalProcessor
-    return Qwen2VLMultiModalProcessor
-
-
 @pytest.mark.parametrize("model_id", ["Qwen/Qwen2-VL-2B-Instruct"])
 # yapf: disable
 @pytest.mark.parametrize(
@@ -24,7 +17,6 @@ def processor_for_qwen2_vl():
 # yapf: enable
 @pytest.mark.parametrize("num_imgs", [1, 2])
 def test_processor_override(
-    processor_for_qwen2_vl,
     image_assets: _ImageAssets,
     model_id: str,
     mm_processor_kwargs: dict[str, object],
@@ -39,18 +31,20 @@ def test_processor_override(
         mm_processor_kwargs=None,
         limit_mm_per_prompt={"image": num_imgs},
     )
-    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
-    ctx = InputProcessingContext(ctx.model_config, tokenizer)
+    tokenizer = cached_get_tokenizer(ctx.model_config.tokenizer)
+    processor = MULTIMODAL_REGISTRY.create_processor(
+        ctx.model_config,
+        tokenizer=tokenizer,
+    )
 
     # Build the image str / prompt based on the number of images we pass
     prompt = "<|vision_start|><|image_pad|><|vision_end|>" * num_imgs
     mm_data = {"image": [image_assets[0].pil_image] * num_imgs}
 
-    processor = processor_for_qwen2_vl(ctx)
     processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs)
 
     # Ensure we have the right number of placeholders per num_crops size
-    hf_processor = processor._get_hf_processor(**mm_processor_kwargs)
+    hf_processor = processor.info.get_hf_processor(**mm_processor_kwargs)
     image_token_id = tokenizer.convert_tokens_to_ids(hf_processor.image_token)
     img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id)
     pixel_shape = processed_inputs["mm_kwargs"]["pixel_values"].shape
diff --git a/tests/multimodal/test_processing.py b/tests/multimodal/test_processing.py
index 75d878217b657..d98bd9736b65f 100644
--- a/tests/multimodal/test_processing.py
+++ b/tests/multimodal/test_processing.py
@@ -10,12 +10,17 @@
 from vllm.config import ModelConfig
 from vllm.inputs import InputProcessingContext
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.processing import (ProcessingCache, PromptReplacement,
-                                        _PlaceholderInfo, find_mm_placeholders,
+# yapf conflicts with isort for this block
+# yapf: disable
+from vllm.multimodal.processing import (PlaceholderInfo, ProcessingCache,
+                                        PromptReplacement,
+                                        find_mm_placeholders,
                                         find_text_matches, find_token_matches,
                                         iter_token_matches,
                                         replace_text_matches,
                                         replace_token_matches)
+# yapf: enable
+from vllm.multimodal.profiling import MultiModalProfiler
 from vllm.multimodal.utils import cached_get_tokenizer
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 from vllm.utils import full_groupby
@@ -431,7 +436,7 @@ def test_find_replace_tokens(
             [1, 9833, 28747, 32000, 9833, 28747, 32000, 32000, 918],
             {
                 "pattern_1": [
-                    _PlaceholderInfo(
+                    PlaceholderInfo(
                         modality="pattern_1",
                         item_idx=0,
                         start_idx=6,
@@ -445,13 +450,13 @@ def test_find_replace_tokens(
             [1, 32000, 32000, 9833, 28747, 32000, 32000, 1550, 918, 1550],
             {
                 "pattern_1": [
-                    _PlaceholderInfo(
+                    PlaceholderInfo(
                         modality="pattern_1",
                         item_idx=0,
                         start_idx=1,
                         replacement=[32000, 32000],
                     ),
-                    _PlaceholderInfo(
+                    PlaceholderInfo(
                         modality="pattern_1",
                         item_idx=1,
                         start_idx=5,
@@ -459,7 +464,7 @@ def test_find_replace_tokens(
                     ),
                 ],
                 "pattern_3": [
-                    _PlaceholderInfo(
+                    PlaceholderInfo(
                         modality="pattern_3",
                         item_idx=0,
                         start_idx=7,
@@ -472,13 +477,13 @@ def test_find_replace_tokens(
             [1, 32000, 32000, 32000, 32000, 32000, 1550, 918, 1550],
             {
                 "pattern_1": [
-                    _PlaceholderInfo(
+                    PlaceholderInfo(
                         modality="pattern_1",
                         item_idx=0,
                         start_idx=1,
                         replacement=[32000, 32000],
                     ),
-                    _PlaceholderInfo(
+                    PlaceholderInfo(
                         modality="pattern_1",
                         item_idx=1,
                         start_idx=3,
@@ -486,7 +491,7 @@ def test_find_replace_tokens(
                     ),
                 ],
                 "pattern_3": [
-                    _PlaceholderInfo(
+                    PlaceholderInfo(
                         modality="pattern_3",
                         item_idx=0,
                         start_idx=6,
@@ -577,19 +582,15 @@ def test_limit_mm_per_prompt_dummy(model_id, limit, num_supported, is_valid):
         revision=None,
         limit_mm_per_prompt=limit_mm_per_prompt,
     )
-    model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config)
 
-    processor_factory = MULTIMODAL_REGISTRY._processor_factories[model_cls]
-    ctx = InputProcessingContext(
+    processor = MULTIMODAL_REGISTRY.create_processor(
         model_config,
         tokenizer=cached_get_tokenizer(model_config.tokenizer),
     )
-
-    processor = processor_factory(ctx, cache=None)
-    profiler = processor.profiling_info
+    profiler = MultiModalProfiler(processor)
 
     mock_supported_mm_limits = MagicMock(return_value={"image": num_supported})
-    profiler.get_supported_mm_limits = mock_supported_mm_limits
+    processor.info.get_supported_mm_limits = mock_supported_mm_limits
 
     if is_valid:
         exc_ctx = nullcontext()
@@ -597,7 +598,7 @@ def test_limit_mm_per_prompt_dummy(model_id, limit, num_supported, is_valid):
         exc_ctx = pytest.raises(ValueError, match="this model only supports")
 
     with exc_ctx:
-        profiler.get_mm_limits()
+        profiler.get_dummy_data(model_config.max_model_len)
 
 
 @pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
@@ -620,16 +621,12 @@ def test_limit_mm_per_prompt_apply(model_id, num_images, limit, is_valid):
         revision=None,
         limit_mm_per_prompt=limit_mm_per_prompt,
     )
-    model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config)
 
-    processor_factory = MULTIMODAL_REGISTRY._processor_factories[model_cls]
-    ctx = InputProcessingContext(
+    processor = MULTIMODAL_REGISTRY.create_processor(
         model_config,
         tokenizer=cached_get_tokenizer(model_config.tokenizer),
     )
 
-    processor = processor_factory(ctx, cache=None)
-
     rng = np.random.RandomState(0)
     image = _rand_img(rng, min_wh=128, max_wh=256)
     if num_images == 0:
@@ -681,9 +678,9 @@ def _test_processing_cache_correctness(
         hf_overrides=hf_overrides,
         limit_mm_per_prompt=limit_mm_per_prompt,
     )
-    model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config)
 
-    processor_factory = MULTIMODAL_REGISTRY._processor_factories[model_cls]
+    model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config)
+    factories = MULTIMODAL_REGISTRY._processor_factories[model_cls]
     ctx = InputProcessingContext(
         model_config,
         tokenizer=cached_get_tokenizer(model_config.tokenizer),
@@ -691,8 +688,9 @@ def _test_processing_cache_correctness(
     # Ensure that it can fit all of the data
     cache = ProcessingCache(capacity=1 << 30)
 
-    baseline_processor = processor_factory(ctx, cache=None)
-    cached_processor = processor_factory(ctx, cache=cache)
+    baseline_processor = factories.build_processor(ctx, cache=None)
+    cached_processor = factories.build_processor(ctx, cache=cache)
+    dummy_inputs = baseline_processor.dummy_inputs
 
     rng = np.random.RandomState(0)
 
@@ -724,7 +722,7 @@ def _test_processing_cache_correctness(
         }
 
         mm_counts = {k: len(vs) for k, vs in mm_data.items()}
-        prompt = baseline_processor.profiling_info.get_dummy_processor_inputs(
+        prompt = dummy_inputs.get_dummy_processor_inputs(
             model_config.max_model_len,
             mm_counts,
         ).prompt_text
diff --git a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py
index 06dfebbb95527..ac64edfd4ec9d 100644
--- a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py
+++ b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py
@@ -2,13 +2,17 @@
 
 import torch
 
-from vllm.model_executor.models.llava import (LlavaForConditionalGeneration,
-                                              LlavaMultiModalProcessor)
+from vllm.model_executor.models.llava import (LlavaDummyInputsBuilder,
+                                              LlavaForConditionalGeneration,
+                                              LlavaMultiModalProcessor,
+                                              LlavaProcessingInfo)
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
 
 
-@MULTIMODAL_REGISTRY.register_processor(LlavaMultiModalProcessor)
+@MULTIMODAL_REGISTRY.register_processor(LlavaMultiModalProcessor,
+                                        info=LlavaProcessingInfo,
+                                        dummy_inputs=LlavaDummyInputsBuilder)
 class MyLlava(LlavaForConditionalGeneration):
 
     def compute_logits(
diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py
index b362ee0cac328..6ddc1eb76f10d 100644
--- a/vllm/inputs/preprocess.py
+++ b/vllm/inputs/preprocess.py
@@ -7,7 +7,7 @@
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
-from vllm.multimodal.processing import MultiModalDataDict, MultiModalInputsV2
+from vllm.multimodal.inputs import MultiModalDataDict, MultiModalInputsV2
 from vllm.prompt_adapter.request import PromptAdapterRequest
 from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup
 from vllm.utils import print_info_once, print_warning_once
diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py
index 2d9d024e03e80..b22b3f1594f24 100644
--- a/vllm/inputs/registry.py
+++ b/vllm/inputs/registry.py
@@ -323,6 +323,7 @@ def dummy_data_for_profiling(
         # Avoid circular import
         from vllm.model_executor.model_loader import get_model_architecture
         from vllm.multimodal import MultiModalKwargs
+        from vllm.multimodal.profiling import MultiModalProfiler
         from vllm.multimodal.utils import cached_get_tokenizer
 
         if mm_registry.has_processor(model_config):
@@ -331,7 +332,8 @@ def dummy_data_for_profiling(
                 trust_remote_code=model_config.trust_remote_code,
             )
             processor = mm_registry.create_processor(model_config, tokenizer)
-            dummy_data = processor.get_dummy_data(seq_len)
+            profiler = MultiModalProfiler(processor)
+            dummy_data = profiler.get_dummy_data(seq_len)
         else:
             model_cls, _ = get_model_architecture(model_config)
             if is_encoder_data:
diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py
index 2e649f10c0765..089062ab53fc3 100644
--- a/vllm/model_executor/models/aria.py
+++ b/vllm/model_executor/models/aria.py
@@ -23,10 +23,10 @@
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs,
                                     NestedTensors)
+from vllm.multimodal.parse import MultiModalDataItems
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        MultiModalDataItems, ProcessingMixin,
-                                        PromptReplacement)
-from vllm.multimodal.profiling import BaseProfilingInfo, ProcessorInputs
+                                        BaseProcessingInfo, PromptReplacement)
+from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.configs.aria import (AriaMoELMConfig,
                                                   AriaVisionConfig)
@@ -445,33 +445,33 @@ def build_mm_projector(config: PretrainedConfig):
     )
 
 
-class AriaProcessingMixin(ProcessingMixin):
+class AriaProcessingInfo(BaseProcessingInfo):
 
-    def _get_hf_config(self):
+    def get_hf_config(self):
         return self.ctx.get_hf_config()
 
-    def _get_vision_config(self) -> AriaVisionConfig:
-        return self._get_hf_config().vision_config
-
-    def _get_num_image_tokens(self) -> int:
-        hf_config = self._get_hf_config()
-        return max(hf_config.projector_patch_to_query_dict.values())
-
-
-class AriaProfilingInfo(AriaProcessingMixin, BaseProfilingInfo):
+    def get_vision_config(self) -> AriaVisionConfig:
+        return self.get_hf_config().vision_config
 
     def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
         return {"image": None}
 
     def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
-        return {"image": self._get_num_image_tokens()}
+        return {"image": self.get_num_image_tokens()}
+
+    def get_num_image_tokens(self) -> int:
+        hf_config = self.get_hf_config()
+        return max(hf_config.projector_patch_to_query_dict.values())
+
+
+class AriaDummyInputsBuilder(BaseDummyInputsBuilder[AriaProcessingInfo]):
 
     def get_dummy_processor_inputs(
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
     ) -> ProcessorInputs:
-        vision_config = self._get_vision_config()
+        vision_config = self.info.get_vision_config()
 
         max_image_size = vision_config.image_size
         num_images = mm_counts.get("image", 0)
@@ -483,7 +483,7 @@ def get_dummy_processor_inputs(
                                    num_images=num_images)
         }
 
-        hf_processor = self._get_hf_processor()
+        hf_processor = self.info.get_hf_processor()
         image_token: str = hf_processor.image_token  # type: ignore
 
         return ProcessorInputs(
@@ -492,10 +492,7 @@ def get_dummy_processor_inputs(
         )
 
 
-class AriaMultiModalProcessor(AriaProcessingMixin, BaseMultiModalProcessor):
-
-    def _get_profiling_info(self) -> BaseProfilingInfo:
-        return AriaProfilingInfo(self.ctx)
+class AriaMultiModalProcessor(BaseMultiModalProcessor[AriaProcessingInfo]):
 
     def _get_mm_fields_config(
         self,
@@ -513,10 +510,10 @@ def _get_prompt_replacements(
         hf_processor_mm_kwargs: Mapping[str, object],
         out_mm_kwargs: MultiModalKwargs,
     ) -> list[PromptReplacement]:
-        hf_config = self._get_hf_config()
+        hf_config = self.info.get_hf_config()
         image_token_id = hf_config.image_token_index
 
-        num_image_tokens = self._get_num_image_tokens()
+        num_image_tokens = self.info.get_num_image_tokens()
 
         return [
             PromptReplacement(
@@ -527,7 +524,9 @@ def _get_prompt_replacements(
         ]
 
 
-@MULTIMODAL_REGISTRY.register_processor(AriaMultiModalProcessor)
+@MULTIMODAL_REGISTRY.register_processor(AriaMultiModalProcessor,
+                                        info=AriaProcessingInfo,
+                                        dummy_inputs=AriaDummyInputsBuilder)
 class AriaForConditionalGeneration(nn.Module, SupportsMultiModal):
     """
     Aria model for conditional generation tasks.
diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py
index fd45783f167b4..7dfc0b687c6e3 100644
--- a/vllm/model_executor/models/blip2.py
+++ b/vllm/model_executor/models/blip2.py
@@ -17,10 +17,10 @@
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
                                     MultiModalInputsV2, MultiModalKwargs,
                                     NestedTensors, PlaceholderRange)
+from vllm.multimodal.parse import MultiModalDataItems
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        MultiModalDataItems, ProcessingMixin,
-                                        PromptReplacement)
-from vllm.multimodal.profiling import BaseProfilingInfo, ProcessorInputs
+                                        BaseProcessingInfo, PromptReplacement)
+from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
 from vllm.sequence import IntermediateTensors
 
 from .blip import BlipVisionModel
@@ -397,30 +397,30 @@ def forward(
         return sequence_output
 
 
-class Blip2ProcessingMixin(ProcessingMixin):
+class Blip2ProcessingInfo(BaseProcessingInfo):
 
-    def _get_hf_config(self):
+    def get_hf_config(self):
         return self.ctx.get_hf_config(Blip2Config)
 
-    def _get_num_image_tokens(self) -> int:
-        hf_config = self._get_hf_config()
-        return hf_config.num_query_tokens
-
-
-class Blip2ProfilingInfo(Blip2ProcessingMixin, BaseProfilingInfo):
-
     def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
         return {"image": 1}
 
     def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
-        return {"image": self._get_num_image_tokens()}
+        return {"image": self.get_num_image_tokens()}
+
+    def get_num_image_tokens(self) -> int:
+        hf_config = self.get_hf_config()
+        return hf_config.num_query_tokens
+
+
+class Blip2DummyInputsBuilder(BaseDummyInputsBuilder[Blip2ProcessingInfo]):
 
     def get_dummy_processor_inputs(
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
     ) -> ProcessorInputs:
-        hf_config = self._get_hf_config()
+        hf_config = self.info.get_hf_config()
         vision_config = hf_config.vision_config
 
         max_image_size = vision_config.image_size
@@ -439,10 +439,7 @@ def get_dummy_processor_inputs(
         )
 
 
-class Blip2MultiModalProcessor(Blip2ProcessingMixin, BaseMultiModalProcessor):
-
-    def _get_profiling_info(self) -> BaseProfilingInfo:
-        return Blip2ProfilingInfo(self.ctx)
+class Blip2MultiModalProcessor(BaseMultiModalProcessor[Blip2ProcessingInfo]):
 
     def _get_mm_fields_config(
         self,
@@ -460,7 +457,7 @@ def _get_prompt_replacements(
         hf_processor_mm_kwargs: Mapping[str, object],
         out_mm_kwargs: MultiModalKwargs,
     ) -> list[PromptReplacement]:
-        num_image_tokens = self._get_num_image_tokens()
+        num_image_tokens = self.info.get_num_image_tokens()
 
         return [
             PromptReplacement(
@@ -491,7 +488,9 @@ def apply(
         return result
 
 
-@MULTIMODAL_REGISTRY.register_processor(Blip2MultiModalProcessor)
+@MULTIMODAL_REGISTRY.register_processor(Blip2MultiModalProcessor,
+                                        info=Blip2ProcessingInfo,
+                                        dummy_inputs=Blip2DummyInputsBuilder)
 class Blip2ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py
index 73ed73b61ebf9..acff926891bbe 100644
--- a/vllm/model_executor/models/chameleon.py
+++ b/vllm/model_executor/models/chameleon.py
@@ -30,10 +30,10 @@
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
                                     MultiModalInputsV2, MultiModalKwargs,
                                     NestedTensors, PlaceholderRange)
+from vllm.multimodal.parse import MultiModalDataItems
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        MultiModalDataItems, ProcessingMixin,
-                                        PromptReplacement)
-from vllm.multimodal.profiling import BaseProfilingInfo, ProcessorInputs
+                                        BaseProcessingInfo, PromptReplacement)
+from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
 from vllm.sequence import IntermediateTensors
 from vllm.utils import print_warning_once
 
@@ -49,33 +49,34 @@ class ChameleonImagePixelInputs(TypedDict):
     """Shape: `(batch_size * num_images, num_channels, height, width)`"""
 
 
-class ChameleonProcessingMixin(ProcessingMixin):
+class ChameleonProcessingInfo(BaseProcessingInfo):
 
-    def _get_hf_config(self):
+    def get_hf_config(self):
         return self.ctx.get_hf_config(ChameleonConfig)
 
-    def _get_hf_processor(self):
+    def get_hf_processor(self):
         return self.ctx.get_hf_processor(ChameleonProcessor)
 
-    def _get_num_image_tokens(self) -> int:
-        processor = self._get_hf_processor()
-        return processor.image_seq_length
-
-
-class ChameleonProfilingInfo(ChameleonProcessingMixin, BaseProfilingInfo):
-
     def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
         return {"image": 1}
 
     def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
-        return {"image": self._get_num_image_tokens()}
+        return {"image": self.get_num_image_tokens()}
+
+    def get_num_image_tokens(self) -> int:
+        processor = self.get_hf_processor()
+        return processor.image_seq_length
+
+
+class ChameleonDummyInputsBuilder(
+        BaseDummyInputsBuilder[ChameleonProcessingInfo]):
 
     def get_dummy_processor_inputs(
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
     ) -> ProcessorInputs:
-        config = self._get_hf_config()
+        config = self.info.get_hf_config()
 
         width = height = config.vq_config.resolution
         num_images = mm_counts.get("image", 0)
@@ -93,11 +94,8 @@ def get_dummy_processor_inputs(
         )
 
 
-class ChameleonMultiModalProcessor(ChameleonProcessingMixin,
-                                   BaseMultiModalProcessor):
-
-    def _get_profiling_info(self) -> BaseProfilingInfo:
-        return ChameleonProfilingInfo(self.ctx)
+class ChameleonMultiModalProcessor(
+        BaseMultiModalProcessor[ChameleonProcessingInfo]):
 
     def _get_mm_fields_config(
         self,
@@ -112,7 +110,7 @@ def _get_prompt_replacements(
         hf_processor_mm_kwargs: Mapping[str, object],
         out_mm_kwargs: MultiModalKwargs,
     ) -> list[PromptReplacement]:
-        processor = self._get_hf_processor(**hf_processor_mm_kwargs)
+        processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
 
         return [
             PromptReplacement(
@@ -120,7 +118,7 @@ def _get_prompt_replacements(
                 target="<image>",
                 replacement="".join([
                     processor.image_start_token,
-                    processor.image_token * self._get_num_image_tokens(),
+                    processor.image_token * self.info.get_num_image_tokens(),
                     processor.image_end_token,
                 ]),
             )
@@ -916,7 +914,10 @@ def forward(
         return hidden_states
 
 
-@MULTIMODAL_REGISTRY.register_processor(ChameleonMultiModalProcessor)
+@MULTIMODAL_REGISTRY.register_processor(
+    ChameleonMultiModalProcessor,
+    info=ChameleonProcessingInfo,
+    dummy_inputs=ChameleonDummyInputsBuilder)
 class ChameleonForConditionalGeneration(nn.Module, SupportsMultiModal,
                                         SupportsPP):
 
diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py
index c937fcb0978b9..59af5f0b3ae98 100644
--- a/vllm/model_executor/models/fuyu.py
+++ b/vllm/model_executor/models/fuyu.py
@@ -33,11 +33,11 @@
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
                                     MultiModalInputsV2, MultiModalKwargs,
                                     NestedTensors, PlaceholderRange)
-from vllm.multimodal.parse import ImageProcessorItems, ImageSize
+from vllm.multimodal.parse import (ImageProcessorItems, ImageSize,
+                                   MultiModalDataItems)
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        MultiModalDataItems, ProcessingMixin,
-                                        PromptReplacement)
-from vllm.multimodal.profiling import BaseProfilingInfo, ProcessorInputs
+                                        BaseProcessingInfo, PromptReplacement)
+from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
 from vllm.sequence import IntermediateTensors
 
 from .interfaces import SupportsMultiModal, SupportsPP
@@ -64,24 +64,38 @@ class FuyuImagePatchInputs(TypedDict):
     """
 
 
-class FuyuProcessingMixin(ProcessingMixin):
+class FuyuProcessingInfo(BaseProcessingInfo):
 
-    def _get_hf_config(self):
+    def get_hf_config(self):
         return self.ctx.get_hf_config(FuyuConfig)
 
-    def _get_hf_processor(self):
+    def get_hf_processor(self):
         return self.ctx.get_hf_processor(FuyuProcessor)
 
-    def _get_image_processor(self) -> FuyuImageProcessor:
-        return self._get_hf_processor().image_processor
+    def get_image_processor(self) -> FuyuImageProcessor:
+        return self.get_hf_processor().image_processor
 
-    def _get_image_feature_grid_size(
+    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+        return {"image": 1}
+
+    def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
+        target_width, target_height = self.get_image_size_with_most_features()
+
+        max_ncols, max_nrows = self.get_image_feature_grid_size(
+            image_width=target_width,
+            image_height=target_height,
+        )
+        max_image_tokens = (max_ncols + 1) * max_nrows
+
+        return {"image": max_image_tokens}
+
+    def get_image_feature_grid_size(
         self,
         *,
         image_width: int,
         image_height: int,
     ) -> tuple[int, int]:
-        image_processor = self._get_image_processor()
+        image_processor = self.get_image_processor()
         target_width = image_processor.size["width"]
         target_height = image_processor.size["height"]
 
@@ -97,34 +111,21 @@ def _get_image_feature_grid_size(
         nrows = math.ceil(image_height / 30)
         return ncols, nrows
 
-
-class FuyuProfilingInfo(FuyuProcessingMixin, BaseProfilingInfo):
-
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
-        return {"image": 1}
-
-    def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
-        target_width, target_height = self._get_image_size_with_most_features()
-
-        max_ncols, max_nrows = self._get_image_feature_grid_size(
-            image_width=target_width,
-            image_height=target_height,
-        )
-        max_image_tokens = (max_ncols + 1) * max_nrows
-
-        return {"image": max_image_tokens}
-
-    def _get_image_size_with_most_features(self) -> ImageSize:
-        image_processor = self._get_image_processor()
+    def get_image_size_with_most_features(self) -> ImageSize:
+        image_processor = self.get_image_processor()
         return ImageSize(width=image_processor.size["width"],
                          height=image_processor.size["height"])
 
+
+class FuyuDummyInputsBuilder(BaseDummyInputsBuilder[FuyuProcessingInfo]):
+
     def get_dummy_processor_inputs(
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
     ) -> ProcessorInputs:
-        target_width, target_height = self._get_image_size_with_most_features()
+        target_width, target_height = \
+            self.info.get_image_size_with_most_features()
         num_images = mm_counts.get("image", 0)
 
         mm_data = {
@@ -140,10 +141,7 @@ def get_dummy_processor_inputs(
         )
 
 
-class FuyuMultiModalProcessor(FuyuProcessingMixin, BaseMultiModalProcessor):
-
-    def _get_profiling_info(self) -> BaseProfilingInfo:
-        return FuyuProfilingInfo(self.ctx)
+class FuyuMultiModalProcessor(BaseMultiModalProcessor[FuyuProcessingInfo]):
 
     def _call_hf_processor(
         self,
@@ -156,7 +154,7 @@ def _call_hf_processor(
             # Avoid warning from HF logger for text-only input
             # Input_ids format: bos_token_id + prompt_token_ids + boa_token_id
             # Tokenizer won't add boa_token_id by default, we add it manually.
-            tokenizer = self._get_tokenizer()
+            tokenizer = self.info.get_tokenizer()
             boa_token_id: int = tokenizer.vocab["<0x04>"]  # type: ignore
             prompt_ids = tokenizer.encode(prompt) + [boa_token_id]
             return BatchFeature(dict(input_ids=[prompt_ids]), tensor_type="pt")
@@ -196,10 +194,10 @@ def _get_prompt_replacements(
         hf_processor_mm_kwargs: Mapping[str, object],
         out_mm_kwargs: MultiModalKwargs,
     ) -> list[PromptReplacement]:
-        hf_config = self._get_hf_config()
+        hf_config = self.info.get_hf_config()
         bos_token_id = hf_config.bos_token_id
 
-        tokenizer = self._get_tokenizer()
+        tokenizer = self.info.get_tokenizer()
         eot_token_id = tokenizer.bos_token_id
         assert isinstance(eot_token_id, int)
 
@@ -207,7 +205,7 @@ def get_replacement_fuyu(item_idx: int):
             images = mm_items.get_items("image", ImageProcessorItems)
             image_size = images.get_image_size(item_idx)
 
-            ncols, nrows = self._get_image_feature_grid_size(
+            ncols, nrows = self.info.get_image_feature_grid_size(
                 image_width=image_size.width,
                 image_height=image_size.height,
             )
@@ -244,7 +242,9 @@ def apply(
         return result
 
 
-@MULTIMODAL_REGISTRY.register_processor(FuyuMultiModalProcessor)
+@MULTIMODAL_REGISTRY.register_processor(FuyuMultiModalProcessor,
+                                        info=FuyuProcessingInfo,
+                                        dummy_inputs=FuyuDummyInputsBuilder)
 class FuyuForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py
index 305f1364dba23..8d94acf3b21d5 100644
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -1,7 +1,7 @@
-from abc import ABC, abstractmethod
+from abc import abstractmethod
 from functools import cached_property
 from typing import (Final, Iterable, List, Literal, Mapping, Optional,
-                    Protocol, Set, Tuple, TypedDict, Union)
+                    Protocol, Set, Tuple, TypedDict, TypeVar, Union)
 
 import torch
 import torch.nn as nn
@@ -25,11 +25,11 @@
                                     MultiModalInputsV2, MultiModalKwargs,
                                     NestedTensors)
 from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
-                                   ImageSize)
+                                   ImageSize, MultiModalDataItems)
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        MultiModalDataItems, ProcessingCache,
-                                        ProcessingMixin, PromptReplacement)
-from vllm.multimodal.profiling import BaseProfilingInfo, ProcessorInputs
+                                        BaseProcessingInfo, ProcessingCache,
+                                        PromptReplacement)
+from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
 from vllm.sequence import IntermediateTensors
 
 from .clip import CLIPVisionModel
@@ -105,34 +105,23 @@ class LlavaLikeProcessor(Protocol):
     image_token: Final[str]
 
 
-class BaseLlavaProcessingMixin(ProcessingMixin, ABC):
+class BaseLlavaProcessingInfo(BaseProcessingInfo):
 
-    def _get_hf_config(self) -> LlavaLikeConfig:
+    def get_hf_config(self) -> LlavaLikeConfig:
         return self.ctx.get_hf_config(LlavaConfig)
 
-    def _get_vision_encoder_info(self):
-        return get_vision_encoder_info(self._get_hf_config())
+    def get_vision_encoder_info(self):
+        return get_vision_encoder_info(self.get_hf_config())
 
     @abstractmethod
-    def _get_hf_processor(self) -> LlavaLikeProcessor:
+    def get_hf_processor(self) -> LlavaLikeProcessor:
         raise NotImplementedError
 
-    def _get_num_image_tokens(
-        self,
-        *,
-        image_width: int,
-        image_height: int,
-    ) -> int:
-        hf_config = self._get_hf_config()
-        vision_encoder_info = self._get_vision_encoder_info()
+    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+        return {"image": None}
 
-        return self._apply_feature_select_strategy(
-            hf_config.vision_feature_select_strategy,
-            vision_encoder_info.get_num_image_tokens(
-                image_width=image_width,
-                image_height=image_height,
-            ),
-        )
+    def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
+        return {"image": self.get_max_image_tokens()}
 
     def _apply_feature_select_strategy(
         self,
@@ -147,28 +136,42 @@ def _apply_feature_select_strategy(
         msg = f"Unexpected feature select strategy: {strategy!r}"
         raise NotImplementedError(msg)
 
+    def get_num_image_tokens(
+        self,
+        *,
+        image_width: int,
+        image_height: int,
+    ) -> int:
+        hf_config = self.get_hf_config()
+        vision_encoder_info = self.get_vision_encoder_info()
 
-class BaseLlavaProfilingInfo(BaseLlavaProcessingMixin, BaseProfilingInfo):
-
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
-        return {"image": None}
-
-    def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
-        return {"image": self._get_max_image_tokens()}
+        return self._apply_feature_select_strategy(
+            hf_config.vision_feature_select_strategy,
+            vision_encoder_info.get_num_image_tokens(
+                image_width=image_width,
+                image_height=image_height,
+            ),
+        )
 
-    def _get_image_size_with_most_features(self) -> ImageSize:
-        vision_encoder_info = self._get_vision_encoder_info()
+    def get_image_size_with_most_features(self) -> ImageSize:
+        vision_encoder_info = self.get_vision_encoder_info()
         width = height = vision_encoder_info.get_image_size()
         return ImageSize(width=width, height=height)
 
-    def _get_max_image_tokens(self) -> int:
-        target_width, target_height = self._get_image_size_with_most_features()
+    def get_max_image_tokens(self) -> int:
+        target_width, target_height = self.get_image_size_with_most_features()
 
-        return self._get_num_image_tokens(
+        return self.get_num_image_tokens(
             image_width=target_width,
             image_height=target_height,
         )
 
+
+_I = TypeVar("_I", bound=BaseLlavaProcessingInfo)
+
+
+class LlavaDummyInputsBuilder(BaseDummyInputsBuilder[_I]):
+
     def get_dummy_processor_inputs(
         self,
         seq_len: int,
@@ -176,9 +179,10 @@ def get_dummy_processor_inputs(
     ) -> ProcessorInputs:
         num_images = mm_counts.get("image", 0)
 
-        processor = self._get_hf_processor()
+        processor = self.info.get_hf_processor()
         image_token = processor.image_token
-        target_width, target_height = self._get_image_size_with_most_features()
+        target_width, target_height = \
+            self.info.get_image_size_with_most_features()
 
         mm_data = {
             "image":
@@ -193,23 +197,13 @@ def get_dummy_processor_inputs(
         )
 
 
-class LlavaProcessingMixin(BaseLlavaProcessingMixin):
+class LlavaProcessingInfo(BaseLlavaProcessingInfo):
 
-    def _get_hf_processor(self):
+    def get_hf_processor(self):
         return self.ctx.get_hf_processor(LlavaProcessor)
 
 
-class LlavaProfilingInfo(LlavaProcessingMixin, BaseLlavaProfilingInfo):
-    pass
-
-
-class BaseLlavaMultiModalProcessor(LlavaProcessingMixin,
-                                   BaseMultiModalProcessor):
-
-    # Copied from BaseMultiModalProcessor
-    @abstractmethod
-    def _get_profiling_info(self) -> BaseProfilingInfo:
-        raise NotImplementedError
+class BaseLlavaMultiModalProcessor(BaseMultiModalProcessor[_I]):
 
     # Copied from BaseMultiModalProcessor
     @abstractmethod
@@ -226,7 +220,7 @@ def _get_prompt_replacements(
         hf_processor_mm_kwargs: Mapping[str, object],
         out_mm_kwargs: MultiModalKwargs,
     ) -> list[PromptReplacement]:
-        hf_config = self._get_hf_config()
+        hf_config = self.info.get_hf_config()
         image_token_id = hf_config.image_token_index
 
         def get_replacement(item_idx: int):
@@ -237,7 +231,7 @@ def get_replacement(item_idx: int):
                 num_image_tokens = images.get_feature_size(item_idx)
             else:
                 image_size = images.get_image_size(item_idx)
-                num_image_tokens = self._get_num_image_tokens(
+                num_image_tokens = self.info.get_num_image_tokens(
                     image_width=image_size.width,
                     image_height=image_size.height,
                 )
@@ -253,10 +247,8 @@ def get_replacement(item_idx: int):
         ]
 
 
-class LlavaMultiModalProcessor(BaseLlavaMultiModalProcessor):
-
-    def _get_profiling_info(self) -> BaseProfilingInfo:
-        return LlavaProfilingInfo(self.ctx)
+class LlavaMultiModalProcessor(
+        BaseLlavaMultiModalProcessor[LlavaProcessingInfo]):
 
     def _get_mm_fields_config(
         self,
@@ -269,21 +261,14 @@ def _get_mm_fields_config(
         )
 
 
-class PixtralHFProcessingMixin(BaseLlavaProcessingMixin):
+class PixtralHFProcessingInfo(BaseLlavaProcessingInfo):
 
-    def _get_hf_processor(self):
+    def get_hf_processor(self):
         return self.ctx.get_hf_processor(PixtralProcessor)
 
 
-class PixtralHFProfilingInfo(PixtralHFProcessingMixin, BaseLlavaProfilingInfo):
-    pass
-
-
-class PixtralHFMultiModalProcessor(PixtralHFProcessingMixin,
-                                   BaseMultiModalProcessor):
-
-    def _get_profiling_info(self) -> BaseProfilingInfo:
-        return PixtralHFProfilingInfo(self.ctx)
+class PixtralHFMultiModalProcessor(
+        BaseMultiModalProcessor[PixtralHFProcessingInfo]):
 
     def _call_hf_processor(
         self,
@@ -328,10 +313,10 @@ def _get_prompt_replacements(
         hf_processor_mm_kwargs: Mapping[str, object],
         out_mm_kwargs: MultiModalKwargs,
     ) -> list[PromptReplacement]:
-        hf_config = self._get_hf_config()
+        hf_config = self.info.get_hf_config()
         image_token_id = hf_config.image_token_index
 
-        processor = self._get_hf_processor()
+        processor = self.info.get_hf_processor()
         image_token = processor.image_token
         image_break_token = processor.image_break_token
         image_end_token = processor.image_end_token
@@ -363,26 +348,40 @@ def get_replacement(item_idx: int):
         ]
 
 
+def _build_llava_or_pixtral_hf_info(
+    ctx: InputProcessingContext, ) -> BaseLlavaProcessingInfo:
+    hf_config = ctx.get_hf_config(LlavaConfig)
+
+    if isinstance(hf_config.vision_config, PixtralVisionConfig):
+        return PixtralHFProcessingInfo(ctx)
+
+    return LlavaProcessingInfo(ctx)
+
+
 def _build_llava_or_pixtral_hf_processor(
-    ctx: InputProcessingContext,
+    info: _I,
+    dummy_inputs: BaseDummyInputsBuilder[_I],
     *,
     cache: Optional[ProcessingCache] = None,
     enable_sanity_checks: bool = True,
 ) -> BaseMultiModalProcessor:
-    hf_config = ctx.get_hf_config(LlavaConfig)
-
-    if isinstance(hf_config.vision_config, PixtralVisionConfig):
+    if isinstance(info, PixtralHFProcessingInfo):
         return PixtralHFMultiModalProcessor(
-            ctx,
+            info,
+            dummy_inputs,  # type: ignore
+            cache=cache,
+            enable_sanity_checks=enable_sanity_checks,
+        )
+
+    if isinstance(info, LlavaProcessingInfo):
+        return LlavaMultiModalProcessor(
+            info,
+            dummy_inputs,  # type: ignore
             cache=cache,
             enable_sanity_checks=enable_sanity_checks,
         )
 
-    return LlavaMultiModalProcessor(
-        ctx,
-        cache=cache,
-        enable_sanity_checks=enable_sanity_checks,
-    )
+    raise NotImplementedError(type(info))
 
 
 def _get_num_hidden_layers(hf_config: LlavaLikeConfig) -> int:
@@ -460,7 +459,9 @@ def init_vision_tower_for_llava(
     raise NotImplementedError(msg)
 
 
-@MULTIMODAL_REGISTRY.register_processor(_build_llava_or_pixtral_hf_processor)
+@MULTIMODAL_REGISTRY.register_processor(_build_llava_or_pixtral_hf_processor,
+                                        info=_build_llava_or_pixtral_hf_info,
+                                        dummy_inputs=LlavaDummyInputsBuilder)
 class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
     # BitandBytes specific attributes
     bitsandbytes_stacked_params_mapping = {
@@ -727,11 +728,11 @@ def apply(
         mm_data: MultiModalDataDict,
         hf_processor_mm_kwargs: Mapping[str, object],
     ) -> MultiModalInputsV2:
-        hf_config = self._get_hf_config()
+        hf_config = self.info.get_hf_config()
         image_token_id = hf_config.image_token_index
 
         # Assume that it doesn't depend on the image size
-        num_image_tokens = self._get_num_image_tokens(
+        num_image_tokens = self.info.get_num_image_tokens(
             image_width=-1,
             image_height=-1,
         )
@@ -796,6 +797,8 @@ def get_replacement_mantis(item_idx: int):
 
 # To use this model, please use
 # `--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'`
-@MULTIMODAL_REGISTRY.register_processor(MantisMultiModalProcessor)
+@MULTIMODAL_REGISTRY.register_processor(MantisMultiModalProcessor,
+                                        info=LlavaProcessingInfo,
+                                        dummy_inputs=LlavaDummyInputsBuilder)
 class MantisForConditionalGeneration(LlavaForConditionalGeneration):
     pass
diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py
index 815456dac2a2f..fda4f22d366b1 100644
--- a/vllm/model_executor/models/llava_next.py
+++ b/vllm/model_executor/models/llava_next.py
@@ -1,6 +1,7 @@
+from abc import abstractmethod
 from functools import cached_property
 from typing import (Final, Iterable, List, Literal, Mapping, Optional,
-                    Protocol, Set, Tuple, TypedDict, Union)
+                    Protocol, Set, Tuple, TypedDict, TypeVar, Union)
 
 import torch
 import torch.nn as nn
@@ -16,13 +17,12 @@
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import MultiModalFieldConfig, NestedTensors
 from vllm.multimodal.parse import ImageSize
-from vllm.multimodal.profiling import BaseProfilingInfo
 from vllm.sequence import IntermediateTensors
 
 from .clip import CLIPVisionModel
 from .interfaces import SupportsMultiModal, SupportsPP
-from .llava import (BaseLlavaMultiModalProcessor, BaseLlavaProcessingMixin,
-                    BaseLlavaProfilingInfo, LlavaLikeConfig,
+from .llava import (BaseLlavaMultiModalProcessor, BaseLlavaProcessingInfo,
+                    LlavaDummyInputsBuilder, LlavaLikeConfig,
                     LlavaMultiModalProjector, init_vision_tower_for_llava)
 from .siglip import SiglipVisionModel
 from .utils import (AutoWeightsLoader, embed_multimodal, flatten_bn,
@@ -65,23 +65,23 @@ class LlavaNextLikeConfig(LlavaLikeConfig, Protocol):
     image_grid_pinpoints: Final[list[list[int]]]
 
 
-class LlavaNextProcessingMixin(BaseLlavaProcessingMixin):
+class LlavaNextProcessingInfo(BaseLlavaProcessingInfo):
 
-    def _get_hf_config(self) -> LlavaNextLikeConfig:
+    def get_hf_config(self) -> LlavaNextLikeConfig:
         return self.ctx.get_hf_config(LlavaNextConfig)
 
-    def _get_hf_processor(self):
+    def get_hf_processor(self):
         return self.ctx.get_hf_processor(LlavaNextProcessor)
 
     # Based on: https://github.com/huggingface/text-generation-inference/blob/v3.0.1/server/text_generation_server/models/vlm_causal_lm.py#L113
-    def _get_num_image_tokens(
+    def get_num_image_tokens(
         self,
         *,
         image_width: int,
         image_height: int,
     ) -> int:
-        hf_config = self._get_hf_config()
-        vision_encoder_info = self._get_vision_encoder_info()
+        hf_config = self.get_hf_config()
+        vision_encoder_info = self.get_vision_encoder_info()
 
         base_feature_size = self._apply_feature_select_strategy(
             hf_config.vision_feature_select_strategy,
@@ -140,16 +140,13 @@ def _get_num_unpadded_features(
 
         return (unpadded_features, newline_features)
 
-
-class LlavaNextProfilingInfo(LlavaNextProcessingMixin, BaseLlavaProfilingInfo):
-
-    def _get_image_size_with_most_features(self) -> ImageSize:
-        hf_config = self._get_hf_config()
+    def get_image_size_with_most_features(self) -> ImageSize:
+        hf_config = self.get_hf_config()
 
         largest_feature_size, largest_feature_pinpoint = 0, None
         for (height, width) in hf_config.image_grid_pinpoints:
-            feat_size = self._get_num_image_tokens(image_width=width,
-                                                   image_height=height)
+            feat_size = self.get_num_image_tokens(image_width=width,
+                                                  image_height=height)
             if feat_size > largest_feature_size:
                 largest_feature_size = feat_size
                 largest_feature_pinpoint = ImageSize(width=width,
@@ -161,11 +158,23 @@ def _get_image_size_with_most_features(self) -> ImageSize:
         return largest_feature_pinpoint
 
 
-class LlavaNextMultiModalProcessor(LlavaNextProcessingMixin,
-                                   BaseLlavaMultiModalProcessor):
+_I = TypeVar("_I", bound=LlavaNextProcessingInfo)
+
+
+class BaseLlavaNextMultiModalProcessor(BaseLlavaMultiModalProcessor[_I]):
+
+    # Copied from BaseMultiModalProcessor
+    @abstractmethod
+    def _get_mm_fields_config(
+        self,
+        hf_inputs: BatchFeature,
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> Mapping[str, MultiModalFieldConfig]:
+        raise NotImplementedError
+
 
-    def _get_profiling_info(self) -> BaseProfilingInfo:
-        return LlavaNextProfilingInfo(self.ctx)
+class LlavaNextMultiModalProcessor(
+        BaseLlavaNextMultiModalProcessor[LlavaNextProcessingInfo]):
 
     def _get_mm_fields_config(
         self,
@@ -179,7 +188,9 @@ def _get_mm_fields_config(
         )
 
 
-@MULTIMODAL_REGISTRY.register_processor(LlavaNextMultiModalProcessor)
+@MULTIMODAL_REGISTRY.register_processor(LlavaNextMultiModalProcessor,
+                                        info=LlavaNextProcessingInfo,
+                                        dummy_inputs=LlavaDummyInputsBuilder)
 class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal,
                                         SupportsPP):
 
diff --git a/vllm/model_executor/models/llava_next_video.py b/vllm/model_executor/models/llava_next_video.py
index 6e82cee1c95a4..5be85d7c0f033 100644
--- a/vllm/model_executor/models/llava_next_video.py
+++ b/vllm/model_executor/models/llava_next_video.py
@@ -17,12 +17,11 @@
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs,
                                     NestedTensors)
-from vllm.multimodal.parse import (ImageSize, VideoEmbeddingItems,
-                                   VideoProcessorItems)
+from vllm.multimodal.parse import (ImageSize, MultiModalDataItems,
+                                   VideoEmbeddingItems, VideoProcessorItems)
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        MultiModalDataItems, ProcessingMixin,
-                                        PromptReplacement)
-from vllm.multimodal.profiling import BaseProfilingInfo, ProcessorInputs
+                                        BaseProcessingInfo, PromptReplacement)
+from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
 from vllm.sequence import IntermediateTensors
 from vllm.utils import is_list_of
 
@@ -47,33 +46,52 @@ class LlavaNextVideoPixelInputs(TypedDict):
     """
 
 
-class LlavaNextVideoProcessingMixin(ProcessingMixin):
+class LlavaNextVideoProcessingInfo(BaseProcessingInfo):
 
-    def _get_hf_config(self):
+    def get_hf_config(self):
         return self.ctx.get_hf_config(LlavaNextVideoConfig)
 
-    def _get_vision_encoder_info(self):
-        return get_vision_encoder_info(self._get_hf_config())
+    def get_vision_encoder_info(self):
+        return get_vision_encoder_info(self.get_hf_config())
 
-    def _get_hf_processor(self):
+    def get_hf_processor(self):
         return self.ctx.get_hf_processor(LlavaNextVideoProcessor)
 
+    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+        return {"video": 1}
+
+    def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
+        target_width, target_height = self.get_image_size_with_most_features()
+
+        max_video_tokens = self.get_num_video_tokens(
+            image_width=target_width,
+            image_height=target_height,
+            num_frames=self.get_num_frames_with_most_features(seq_len),
+        )
+
+        return {"video": max_video_tokens}
+
+    def get_image_size_with_most_features(self) -> ImageSize:
+        vision_encoder_info = self.get_vision_encoder_info()
+        width = height = vision_encoder_info.get_image_size()
+        return ImageSize(width=width, height=height)
+
     def _get_num_frame_tokens(
         self,
         *,
         image_width: int,
         image_height: int,
     ) -> int:
-        hf_config = self._get_hf_config()
+        hf_config = self.get_hf_config()
         spatial_pool_stride = hf_config.spatial_pool_stride
 
-        vision_encoder_info = self._get_vision_encoder_info()
+        vision_encoder_info = self.get_vision_encoder_info()
         patch_grid_length = vision_encoder_info.get_patch_grid_length()
         pooled_grid_length = math.ceil(patch_grid_length / spatial_pool_stride)
 
         return pooled_grid_length * pooled_grid_length
 
-    def _get_num_video_tokens(
+    def get_num_video_tokens(
         self,
         *,
         image_width: int,
@@ -87,37 +105,14 @@ def _get_num_video_tokens(
 
         return num_frame_tokens * num_frames
 
-
-class LlavaNextVideoProfilingInfo(LlavaNextVideoProcessingMixin,
-                                  BaseProfilingInfo):
-
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
-        return {"video": 1}
-
-    def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
-        target_width, target_height = self._get_image_size_with_most_features()
-
-        max_video_tokens = self._get_num_video_tokens(
-            image_width=target_width,
-            image_height=target_height,
-            num_frames=self._get_dummy_num_frames(seq_len),
-        )
-
-        return {"video": max_video_tokens}
-
-    def _get_image_size_with_most_features(self) -> ImageSize:
-        vision_encoder_info = self._get_vision_encoder_info()
-        width = height = vision_encoder_info.get_image_size()
-        return ImageSize(width=width, height=height)
-
     def _get_max_video_frames(self, max_tokens: int) -> int:
-        target_width, target_height = self._get_image_size_with_most_features()
+        target_width, target_height = self.get_image_size_with_most_features()
 
         num_frames = 0
 
         while True:
             next_num_frames = num_frames + 1
-            next_max_tokens = self._get_num_video_tokens(
+            next_max_tokens = self.get_num_video_tokens(
                 image_width=target_width,
                 image_height=target_height,
                 num_frames=next_num_frames,
@@ -130,7 +125,7 @@ def _get_max_video_frames(self, max_tokens: int) -> int:
 
         return num_frames
 
-    def _get_dummy_num_frames(self, seq_len: int) -> int:
+    def get_num_frames_with_most_features(self, seq_len: int) -> int:
         mm_config = self.ctx.get_mm_config()
         max_videos = mm_config.limit_per_prompt.get("video", 1)
 
@@ -138,6 +133,10 @@ def _get_dummy_num_frames(self, seq_len: int) -> int:
 
         return max(max_total_frames // max(max_videos, 1), 1)
 
+
+class LlavaNextVideoDummyInputsBuilder(
+        BaseDummyInputsBuilder[LlavaNextVideoProcessingInfo]):
+
     def get_dummy_processor_inputs(
         self,
         seq_len: int,
@@ -145,16 +144,20 @@ def get_dummy_processor_inputs(
     ) -> ProcessorInputs:
         num_videos = mm_counts.get("video", 0)
 
-        processor = self._get_hf_processor()
+        processor = self.info.get_hf_processor()
         video_token = processor.video_token
-        target_width, target_height = self._get_image_size_with_most_features()
+
+        target_width, target_height = \
+            self.info.get_image_size_with_most_features()
+        target_num_frames = \
+            self.info.get_num_frames_with_most_features(seq_len)
 
         mm_data = {
             "video":
             self._get_dummy_videos(
                 width=target_width,
                 height=target_height,
-                num_frames=self._get_dummy_num_frames(seq_len),
+                num_frames=target_num_frames,
                 num_videos=num_videos,
             )
         }
@@ -165,11 +168,8 @@ def get_dummy_processor_inputs(
         )
 
 
-class LlavaNextVideoMultiModalProcessor(LlavaNextVideoProcessingMixin,
-                                        BaseMultiModalProcessor):
-
-    def _get_profiling_info(self) -> BaseProfilingInfo:
-        return LlavaNextVideoProfilingInfo(self.ctx)
+class LlavaNextVideoMultiModalProcessor(
+        BaseMultiModalProcessor[LlavaNextVideoProcessingInfo]):
 
     def _get_mm_fields_config(
         self,
@@ -184,7 +184,7 @@ def _get_prompt_replacements(
         hf_processor_mm_kwargs: Mapping[str, object],
         out_mm_kwargs: MultiModalKwargs,
     ) -> list[PromptReplacement]:
-        hf_config = self._get_hf_config()
+        hf_config = self.info.get_hf_config()
         video_token_id = hf_config.video_token_index
 
         def get_replacement(item_idx: int):
@@ -195,7 +195,7 @@ def get_replacement(item_idx: int):
                 num_video_tokens = videos.get_feature_size(item_idx)
             else:
                 image_size = videos.get_frame_size(item_idx)
-                num_video_tokens = self._get_num_video_tokens(
+                num_video_tokens = self.info.get_num_video_tokens(
                     image_width=image_size.width,
                     image_height=image_size.height,
                     num_frames=videos.get_num_frames(item_idx),
@@ -269,7 +269,11 @@ def forward(self, image_features: torch.Tensor) -> torch.Tensor:
         return hidden_states
 
 
-@MULTIMODAL_REGISTRY.register_processor(LlavaNextVideoMultiModalProcessor)
+@MULTIMODAL_REGISTRY.register_processor(
+    LlavaNextVideoMultiModalProcessor,
+    info=LlavaNextVideoProcessingInfo,
+    dummy_inputs=LlavaNextVideoDummyInputsBuilder,
+)
 class LlavaNextVideoForConditionalGeneration(nn.Module, SupportsMultiModal,
                                              SupportsPP):
 
diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py
index b5e3edba1f01c..78a47e64d9afc 100644
--- a/vllm/model_executor/models/llava_onevision.py
+++ b/vllm/model_executor/models/llava_onevision.py
@@ -17,19 +17,20 @@
 from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import MultiModalKwargs, NestedTensors
-from vllm.multimodal.parse import (ImageSize, MultiModalDataItems,
-                                   VideoEmbeddingItems, VideoProcessorItems)
-from vllm.multimodal.processing import MultiModalFieldConfig, PromptReplacement
-from vllm.multimodal.profiling import BaseProfilingInfo, ProcessorInputs
+from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs,
+                                    NestedTensors)
+from vllm.multimodal.parse import (MultiModalDataItems, VideoEmbeddingItems,
+                                   VideoProcessorItems)
+from vllm.multimodal.processing import PromptReplacement
+from vllm.multimodal.profiling import ProcessorInputs
 from vllm.sequence import IntermediateTensors
 from vllm.utils import is_list_of
 
 from .clip import CLIPVisionModel
 from .interfaces import SupportsMultiModal, SupportsPP
-from .llava import BaseLlavaProfilingInfo, init_vision_tower_for_llava
-from .llava_next import (LlavaNextLikeConfig, LlavaNextMultiModalProcessor,
-                         LlavaNextProcessingMixin)
+from .llava import LlavaDummyInputsBuilder, init_vision_tower_for_llava
+from .llava_next import (BaseLlavaNextMultiModalProcessor, LlavaNextLikeConfig,
+                         LlavaNextProcessingInfo)
 from .siglip import SiglipVisionModel
 from .utils import (AutoWeightsLoader, flatten_bn, init_vllm_registered_model,
                     maybe_prefix, merge_multimodal_embeddings)
@@ -89,14 +90,23 @@ class LlavaOnevisionLikeConfig(LlavaNextLikeConfig, Protocol):
     video_token_index: Final[int]
 
 
-class LlavaOnevisionProcessingMixin(LlavaNextProcessingMixin):
+class LlavaOnevisionProcessingInfo(LlavaNextProcessingInfo):
 
-    def _get_hf_config(self) -> LlavaOnevisionLikeConfig:
+    def get_hf_config(self) -> LlavaOnevisionLikeConfig:
         return self.ctx.get_hf_config(LlavaOnevisionConfig)
 
-    def _get_hf_processor(self):
+    def get_hf_processor(self):
         return self.ctx.get_hf_processor(LlavaOnevisionProcessor)
 
+    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+        return {"image": None, "video": None}
+
+    def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
+        return {
+            "image": self.get_max_image_tokens(),
+            "video": self.get_max_video_tokens(seq_len),
+        }
+
     # Based on: https://github.com/huggingface/text-generation-inference/blob/v3.0.1/server/text_generation_server/models/vlm_causal_lm.py#L86
     # with additional logic afterwards taken from LlavaOnevisionProcessor
     def _get_num_unpadded_features(
@@ -141,16 +151,16 @@ def _get_num_frame_tokens(
         image_width: int,
         image_height: int,
     ) -> int:
-        hf_config = self._get_hf_config()
+        hf_config = self.get_hf_config()
         spatial_pool_stride = getattr(hf_config, "spatial_pool_stride", 2)
 
-        vision_encoder_info = self._get_vision_encoder_info()
+        vision_encoder_info = self.get_vision_encoder_info()
         patch_grid_length = vision_encoder_info.get_patch_grid_length()
         pooled_grid_length = math.ceil(patch_grid_length / spatial_pool_stride)
 
         return pooled_grid_length * pooled_grid_length
 
-    def _get_num_video_tokens(
+    def get_num_video_tokens(
         self,
         *,
         image_width: int,
@@ -164,43 +174,14 @@ def _get_num_video_tokens(
 
         return num_frame_tokens * num_frames + 1  # Newline token
 
-
-class LlavaOnevisionProfilingInfo(LlavaOnevisionProcessingMixin,
-                                  BaseLlavaProfilingInfo):
-
-    def _get_image_size_with_most_features(self) -> ImageSize:
-        hf_config = self._get_hf_config()
-        largest_feature_size, largest_feature_pinpoint = 0, None
-        for (height, width) in hf_config.image_grid_pinpoints:
-            feat_size = self._get_num_image_tokens(image_width=width,
-                                                   image_height=height)
-            if feat_size > largest_feature_size:
-                largest_feature_size = feat_size
-                largest_feature_pinpoint = ImageSize(width=width,
-                                                     height=height)
-
-        if largest_feature_size == 0 or largest_feature_pinpoint is None:
-            raise ValueError("Cannot have a largest feature size of 0!")
-
-        return largest_feature_pinpoint
-
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
-        return {"image": None, "video": None}
-
-    def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
-        return {
-            "image": self._get_max_image_tokens(),
-            "video": self._get_max_video_tokens(seq_len),
-        }
-
     def _get_max_video_frames(self, max_tokens: int) -> int:
-        target_width, target_height = self._get_image_size_with_most_features()
+        target_width, target_height = self.get_image_size_with_most_features()
 
         num_frames = 0
 
         while True:
             next_num_frames = num_frames + 1
-            next_max_tokens = self._get_num_video_tokens(
+            next_max_tokens = self.get_num_video_tokens(
                 image_width=target_width,
                 image_height=target_height,
                 num_frames=next_num_frames,
@@ -213,12 +194,12 @@ def _get_max_video_frames(self, max_tokens: int) -> int:
 
         return num_frames
 
-    def _get_dummy_num_frames(self, seq_len: int) -> int:
+    def get_num_frames_with_most_features(self, seq_len: int) -> int:
         mm_config = self.ctx.get_mm_config()
         max_images = mm_config.limit_per_prompt.get("image", 1)
         max_videos = mm_config.limit_per_prompt.get("video", 1)
 
-        max_image_tokens = self._get_max_image_tokens() * max_images
+        max_image_tokens = self.get_max_image_tokens() * max_images
         max_total_frames = self._get_max_video_frames(seq_len -
                                                       max_image_tokens)
         max_frames_per_video = min(max_total_frames // max(max_videos, 1),
@@ -226,15 +207,19 @@ def _get_dummy_num_frames(self, seq_len: int) -> int:
 
         return max(max_frames_per_video, 1)
 
-    def _get_max_video_tokens(self, seq_len: int) -> int:
-        target_width, target_height = self._get_image_size_with_most_features()
+    def get_max_video_tokens(self, seq_len: int) -> int:
+        target_width, target_height = self.get_image_size_with_most_features()
 
-        return self._get_num_video_tokens(
+        return self.get_num_video_tokens(
             image_width=target_width,
             image_height=target_height,
-            num_frames=self._get_dummy_num_frames(seq_len),
+            num_frames=self.get_num_frames_with_most_features(seq_len),
         )
 
+
+class LlavaOnevisionDummyInputsBuilder(
+        LlavaDummyInputsBuilder[LlavaOnevisionProcessingInfo]):
+
     def get_dummy_processor_inputs(
         self,
         seq_len: int,
@@ -243,10 +228,14 @@ def get_dummy_processor_inputs(
         num_images = mm_counts.get("image", 0)
         num_videos = mm_counts.get("video", 0)
 
-        processor = self._get_hf_processor()
+        processor = self.info.get_hf_processor()
         image_token = processor.image_token
         video_token = processor.video_token
-        target_width, target_height = self._get_image_size_with_most_features()
+
+        target_width, target_height = \
+            self.info.get_image_size_with_most_features()
+        target_num_frames = \
+            self.info.get_num_frames_with_most_features(seq_len)
 
         mm_data = {
             "image":
@@ -257,7 +246,7 @@ def get_dummy_processor_inputs(
             self._get_dummy_videos(
                 width=target_width,
                 height=target_height,
-                num_frames=self._get_dummy_num_frames(seq_len),
+                num_frames=target_num_frames,
                 num_videos=num_videos,
             )
         }
@@ -268,11 +257,8 @@ def get_dummy_processor_inputs(
         )
 
 
-class LlavaOnevisionMultiModalProcessor(LlavaOnevisionProcessingMixin,
-                                        LlavaNextMultiModalProcessor):
-
-    def _get_profiling_info(self) -> BaseProfilingInfo:
-        return LlavaOnevisionProfilingInfo(self.ctx)
+class LlavaOnevisionMultiModalProcessor(
+        BaseLlavaNextMultiModalProcessor[LlavaOnevisionProcessingInfo]):
 
     def _get_mm_fields_config(
         self,
@@ -303,7 +289,7 @@ def _call_hf_processor(
                 mm_kwargs=mm_kwargs,
             )
 
-        processor = self._get_hf_processor()
+        processor = self.info.get_hf_processor()
         video_token = processor.video_token
 
         # LLaVA-OneVision processor doesn't support multiple videos
@@ -345,7 +331,7 @@ def _get_prompt_replacements(
             out_mm_kwargs=out_mm_kwargs,
         )
 
-        hf_config = self._get_hf_config()
+        hf_config = self.info.get_hf_config()
         video_token_id = hf_config.video_token_index
 
         def get_video_replacement(item_idx: int):
@@ -356,7 +342,7 @@ def get_video_replacement(item_idx: int):
                 num_video_tokens = videos.get_feature_size(item_idx)
             else:
                 image_size = videos.get_frame_size(item_idx)
-                num_video_tokens = self._get_num_video_tokens(
+                num_video_tokens = self.info.get_num_video_tokens(
                     image_width=image_size.width,
                     image_height=image_size.height,
                     num_frames=videos.get_num_frames(item_idx),
@@ -393,7 +379,10 @@ def forward(self, image_features: torch.Tensor) -> torch.Tensor:
         return hidden_states
 
 
-@MULTIMODAL_REGISTRY.register_processor(LlavaOnevisionMultiModalProcessor)
+@MULTIMODAL_REGISTRY.register_processor(
+    LlavaOnevisionMultiModalProcessor,
+    info=LlavaOnevisionProcessingInfo,
+    dummy_inputs=LlavaOnevisionDummyInputsBuilder)
 class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal,
                                              SupportsPP):
 
diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index c8418c14e5fdf..a1b1af35604db 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -34,13 +34,12 @@
                                     MultiModalInputsV2, MultiModalKwargs,
                                     NestedTensors, PlaceholderRange)
 from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
-                                   ImageSize)
+                                   ImageSize, MultiModalDataItems)
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        MultiModalDataItems, ProcessingMixin,
-                                        PromptReplacement,
-                                        _BoundPromptReplacement,
-                                        _PlaceholderInfo)
-from vllm.multimodal.profiling import BaseProfilingInfo, ProcessorInputs
+                                        BaseProcessingInfo,
+                                        BoundPromptReplacement,
+                                        PlaceholderInfo, PromptReplacement)
+from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
 from vllm.sequence import IntermediateTensors
 from vllm.utils import is_list_of
 
@@ -302,9 +301,9 @@ def add_image_newline(self, image_features_hd):
         return image_features_hd_newline
 
 
-class Phi3VProcessingMixin(ProcessingMixin):
+class Phi3VProcessingInfo(BaseProcessingInfo):
 
-    def _get_hf_processor(
+    def get_hf_processor(
         self,
         *,
         num_crops: Optional[int] = None,
@@ -314,39 +313,42 @@ def _get_hf_processor(
 
         return self.ctx.get_hf_processor()
 
-    def _get_num_image_tokens(
-        self,
-        *,
-        image_width: int,
-        image_height: int,
-    ) -> int:
-        processor = self._get_hf_processor()
-
-        return processor.calc_num_image_tokens_from_image_size(  # type: ignore
-            width=image_width,
-            height=image_height,
-        )
-
-
-class Phi3VProfilingInfo(Phi3VProcessingMixin, BaseProfilingInfo):
-
     def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
         return {"image": None}
 
     def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
-        target_width, target_height = self._get_image_size_with_most_features()
+        target_width, target_height = self.get_image_size_with_most_features()
 
-        max_image_tokens = self._get_num_image_tokens(
+        max_image_tokens = self.get_num_image_tokens(
             image_width=target_width,
             image_height=target_height,
+            processor=None,
         )
 
         return {"image": max_image_tokens}
 
-    def _get_image_size_with_most_features(self) -> ImageSize:
+    def get_num_image_tokens(
+        self,
+        *,
+        image_width: int,
+        image_height: int,
+        processor: Optional[ProcessorMixin],
+    ) -> int:
+        if processor is None:
+            processor = self.get_hf_processor()
+
+        return processor.calc_num_image_tokens_from_image_size(  # type: ignore
+            width=image_width,
+            height=image_height,
+        )
+
+    def get_image_size_with_most_features(self) -> ImageSize:
         # Result in the max possible feature size (h:w = 16:1)
         return ImageSize(height=8000, width=50)
 
+
+class Phi3VDummyInputsBuilder(BaseDummyInputsBuilder[Phi3VProcessingInfo]):
+
     def get_dummy_processor_inputs(
         self,
         seq_len: int,
@@ -354,7 +356,8 @@ def get_dummy_processor_inputs(
     ) -> ProcessorInputs:
         num_images = mm_counts.get("image", 0)
 
-        target_width, target_height = self._get_image_size_with_most_features()
+        target_width, target_height = \
+            self.info.get_image_size_with_most_features()
 
         mm_data = {
             "image":
@@ -363,7 +366,7 @@ def get_dummy_processor_inputs(
                                    num_images=num_images)
         }
 
-        hf_processor = self._get_hf_processor()
+        hf_processor = self.info.get_hf_processor()
         image_tokens: list[str] = hf_processor.img_tokens  # type: ignore
 
         return ProcessorInputs(
@@ -372,10 +375,7 @@ def get_dummy_processor_inputs(
         )
 
 
-class Phi3VMultiModalProcessor(Phi3VProcessingMixin, BaseMultiModalProcessor):
-
-    def _get_profiling_info(self) -> BaseProfilingInfo:
-        return Phi3VProfilingInfo(self.ctx)
+class Phi3VMultiModalProcessor(BaseMultiModalProcessor[Phi3VProcessingInfo]):
 
     def _call_hf_processor(
         self,
@@ -416,10 +416,10 @@ def _get_prompt_replacements(
         hf_processor_mm_kwargs: Mapping[str, Any],
         out_mm_kwargs: MultiModalKwargs,
     ) -> list[PromptReplacement]:
-        hf_processor = self._get_hf_processor(**hf_processor_mm_kwargs)
+        hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
         image_tokens: list[str] = hf_processor.img_tokens  # type: ignore
 
-        tokenizer = self._get_tokenizer()
+        tokenizer = self.info.get_tokenizer()
         bos_token_id = tokenizer.bos_token_id
         assert isinstance(bos_token_id, int)
 
@@ -431,9 +431,10 @@ def get_replacement_phi3v(item_idx: int):
                 num_image_tokens = images.get_feature_size(item_idx)
             else:
                 image_size = images.get_image_size(item_idx)
-                num_image_tokens = self._get_num_image_tokens(
+                num_image_tokens = self.info.get_num_image_tokens(
                     image_width=image_size.width,
                     image_height=image_size.height,
+                    processor=hf_processor,
                 )
 
             return [_IMAGE_TOKEN_ID] * num_image_tokens + [bos_token_id]
@@ -451,9 +452,9 @@ def get_replacement_phi3v(item_idx: int):
     def _apply_prompt_replacements(
         self,
         token_ids: list[int],
-        mm_prompt_repls: Mapping[str, Sequence[_BoundPromptReplacement]],
+        mm_prompt_repls: Mapping[str, Sequence[BoundPromptReplacement]],
         mm_item_counts: Mapping[str, int],
-    ) -> tuple[list[int], str, Mapping[str, list[_PlaceholderInfo]]]:
+    ) -> tuple[list[int], str, Mapping[str, list[PlaceholderInfo]]]:
         token_ids, text, placeholders = super()._apply_prompt_replacements(
             token_ids=token_ids,
             mm_prompt_repls=mm_prompt_repls,
@@ -466,7 +467,7 @@ def _apply_prompt_replacements(
             token_ids = [token_ids[0], *token_ids[2:]]
             placeholders = {
                 modality: [
-                    _PlaceholderInfo(
+                    PlaceholderInfo(
                         modality=p.modality,
                         item_idx=p.item_idx,
                         start_idx=p.start_idx - 1,
@@ -499,7 +500,9 @@ def apply(
         return result
 
 
-@MULTIMODAL_REGISTRY.register_processor(Phi3VMultiModalProcessor)
+@MULTIMODAL_REGISTRY.register_processor(Phi3VMultiModalProcessor,
+                                        info=Phi3VProcessingInfo,
+                                        dummy_inputs=Phi3VDummyInputsBuilder)
 class Phi3VForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
     hf_to_vllm_mapper = WeightsMapper(
         orig_to_new_prefix={
diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py
index 7012ddc66cd9c..0dff9595c6c08 100644
--- a/vllm/model_executor/models/qwen2_audio.py
+++ b/vllm/model_executor/models/qwen2_audio.py
@@ -38,11 +38,11 @@
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs,
                                     NestedTensors)
-from vllm.multimodal.parse import AudioProcessorItems, MultiModalDataParser
+from vllm.multimodal.parse import (AudioProcessorItems, MultiModalDataItems,
+                                   MultiModalDataParser)
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        MultiModalDataItems, ProcessingMixin,
-                                        PromptReplacement)
-from vllm.multimodal.profiling import BaseProfilingInfo, ProcessorInputs
+                                        BaseProcessingInfo, PromptReplacement)
+from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
 from vllm.sequence import IntermediateTensors
 
 from .interfaces import SupportsMultiModal, SupportsPP
@@ -80,12 +80,12 @@ def _get_feat_extract_output_lengths(input_lengths: torch.Tensor):
     return feat_lengths, output_lengths
 
 
-class Qwen2AudioProcessingMixin(ProcessingMixin):
+class Qwen2AudioProcessingInfo(BaseProcessingInfo):
 
-    def _get_hf_config(self):
+    def get_hf_config(self):
         return self.ctx.get_hf_config(Qwen2AudioConfig)
 
-    def _get_hf_processor(
+    def get_hf_processor(
         self,
         *,
         # Ignored in initialization
@@ -93,36 +93,37 @@ def _get_hf_processor(
     ) -> Qwen2AudioProcessor:
         return self.ctx.get_hf_processor(Qwen2AudioProcessor)
 
-    def _get_feature_extractor(
+    def get_feature_extractor(
         self,
         *,
         # Ignored in initialization
         sampling_rate: Optional[int] = None,
     ) -> WhisperFeatureExtractor:
-        hf_processor = self._get_hf_processor(sampling_rate=sampling_rate)
+        hf_processor = self.get_hf_processor(sampling_rate=sampling_rate)
         feature_extractor = hf_processor.feature_extractor  # type: ignore
         assert isinstance(feature_extractor, WhisperFeatureExtractor)
         return feature_extractor
 
-
-class Qwen2AudioProfilingInfo(Qwen2AudioProcessingMixin, BaseProfilingInfo):
-
     def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
         return {"audio": None}
 
     def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
-        hf_config = self._get_hf_config()
+        hf_config = self.get_hf_config()
         max_source_positions = hf_config.audio_config.max_source_positions
         max_output_lengths = (max_source_positions - 2) // 2 + 1
 
         return {"audio": max_output_lengths}
 
+
+class Qwen2AudioDummyInputsBuilder(
+        BaseDummyInputsBuilder[Qwen2AudioProcessingInfo]):
+
     def get_dummy_processor_inputs(
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
     ) -> ProcessorInputs:
-        feature_extractor = self._get_feature_extractor()
+        feature_extractor = self.info.get_feature_extractor()
 
         sampling_rate = feature_extractor.sampling_rate
         audio_len = feature_extractor.chunk_length * sampling_rate
@@ -139,14 +140,11 @@ def get_dummy_processor_inputs(
         )
 
 
-class Qwen2AudioMultiModalProcessor(Qwen2AudioProcessingMixin,
-                                    BaseMultiModalProcessor):
-
-    def _get_profiling_info(self) -> BaseProfilingInfo:
-        return Qwen2AudioProfilingInfo(self.ctx)
+class Qwen2AudioMultiModalProcessor(
+        BaseMultiModalProcessor[Qwen2AudioProcessingInfo]):
 
     def _get_data_parser(self) -> MultiModalDataParser:
-        feature_extractor = self._get_feature_extractor()
+        feature_extractor = self.info.get_feature_extractor()
         return MultiModalDataParser(target_sr=feature_extractor.sampling_rate)
 
     def _call_hf_processor(
@@ -161,7 +159,7 @@ def _call_hf_processor(
         if audios:
             mm_data["audios"] = audios
 
-            feature_extractor = self._get_feature_extractor(**mm_kwargs)
+            feature_extractor = self.info.get_feature_extractor(**mm_kwargs)
             mm_kwargs = dict(
                 **mm_kwargs,
                 sampling_rate=feature_extractor.sampling_rate,
@@ -194,7 +192,7 @@ def _get_prompt_replacements(
         hf_processor_mm_kwargs: Mapping[str, object],
         out_mm_kwargs: MultiModalKwargs,
     ) -> list[PromptReplacement]:
-        hf_config = self._get_hf_config()
+        hf_config = self.info.get_hf_config()
         placeholder = hf_config.audio_token_index
 
         feature_attention_mask = out_mm_kwargs.get("feature_attention_mask")
@@ -234,10 +232,13 @@ def _always_apply_prompt_replacements(self) -> bool:
         # has already performed processing for multi-audio input when the input
         # audios are short (the corresponding placeholders may take up fewer
         # tokens than the number of audio items)
-        return not hasattr(self._get_hf_processor(), "audio_token")
+        return not hasattr(self.info.get_hf_processor(), "audio_token")
 
 
-@MULTIMODAL_REGISTRY.register_processor(Qwen2AudioMultiModalProcessor)
+@MULTIMODAL_REGISTRY.register_processor(
+    Qwen2AudioMultiModalProcessor,
+    info=Qwen2AudioProcessingInfo,
+    dummy_inputs=Qwen2AudioDummyInputsBuilder)
 class Qwen2AudioForConditionalGeneration(nn.Module, SupportsMultiModal,
                                          SupportsPP):
 
diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py
index a5c2fb9e84df3..8537fec854b6d 100644
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -57,11 +57,10 @@
                                     MultiModalFieldConfig, MultiModalKwargs,
                                     NestedTensors, VideoItem)
 from vllm.multimodal.parse import (ImageSize, ModalityDataItems,
-                                   MultiModalDataParser)
+                                   MultiModalDataItems, MultiModalDataParser)
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        MultiModalDataItems, ProcessingMixin,
-                                        PromptReplacement)
-from vllm.multimodal.profiling import BaseProfilingInfo, ProcessorInputs
+                                        BaseProcessingInfo, PromptReplacement)
+from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
 from vllm.platforms import _Backend
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.config import uses_mrope
@@ -709,12 +708,12 @@ def _parse_video_data(
         return super()._parse_video_data(data)
 
 
-class Qwen2VLProcessingMixin(ProcessingMixin):
+class Qwen2VLProcessingInfo(BaseProcessingInfo):
 
-    def _get_hf_config(self):
+    def get_hf_config(self):
         return self.ctx.get_hf_config(Qwen2VLConfig)
 
-    def _get_hf_processor(
+    def get_hf_processor(
         self,
         *,
         min_pixels: Optional[int] = None,
@@ -736,18 +735,27 @@ def _get_hf_processor(
 
         return hf_processor
 
-    def _get_image_processor(
+    def get_image_processor(
         self,
         *,
         min_pixels: Optional[int] = None,
         max_pixels: Optional[int] = None,
     ):
-        hf_processor = self._get_hf_processor(min_pixels=min_pixels,
-                                              max_pixels=max_pixels)
+        hf_processor = self.get_hf_processor(min_pixels=min_pixels,
+                                             max_pixels=max_pixels)
         image_processor = hf_processor.image_processor  # type: ignore
         assert isinstance(image_processor, Qwen2VLImageProcessor)
         return image_processor
 
+    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+        return {"image": None, "video": None}
+
+    def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
+        return {
+            "image": self.get_max_image_tokens(),
+            "video": self.get_max_video_tokens(seq_len),
+        }
+
     def _get_vision_info(
         self,
         *,
@@ -755,15 +763,17 @@ def _get_vision_info(
         image_height: int,
         num_frames: int = 1,
         do_resize: bool = True,
+        image_processor: Optional[Qwen2VLImageProcessor],
     ) -> tuple[ImageSize, int]:
-        hf_config = self._get_hf_config()
+        if image_processor is None:
+            image_processor = self.get_image_processor()
+
+        hf_config = self.get_hf_config()
         vision_config = hf_config.vision_config
         patch_size = vision_config.patch_size
         merge_size = vision_config.spatial_merge_size
         temporal_patch_size = vision_config.temporal_patch_size
 
-        image_processor = self._get_image_processor()
-
         if do_resize:
             resized_height, resized_width = smart_resize(
                 height=image_height,
@@ -787,70 +797,65 @@ def _get_vision_info(
 
         return preprocessed_size, num_vision_tokens
 
-    def _get_num_image_tokens(
+    def get_num_image_tokens(
         self,
         *,
         image_width: int,
         image_height: int,
+        image_processor: Optional[Qwen2VLImageProcessor],
     ) -> int:
         _, num_image_tokens = self._get_vision_info(
             image_width=image_width,
             image_height=image_height,
+            image_processor=image_processor,
         )
         return num_image_tokens
 
-    def _get_num_video_tokens(
+    def get_num_video_tokens(
         self,
         *,
         image_width: int,
         image_height: int,
         num_frames: int,
+        image_processor: Optional[Qwen2VLImageProcessor],
     ) -> int:
         _, num_video_tokens = self._get_vision_info(
             image_width=image_width,
             image_height=image_height,
             num_frames=num_frames,
+            image_processor=image_processor,
         )
         return num_video_tokens
 
-
-class Qwen2VLProfilingInfo(Qwen2VLProcessingMixin, BaseProfilingInfo):
-
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
-        return {"image": None, "video": None}
-
-    def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
-        return {
-            "image": self._get_max_image_tokens(),
-            "video": self._get_max_video_tokens(seq_len),
-        }
-
-    def _get_image_size_with_most_features(self) -> ImageSize:
+    def get_image_size_with_most_features(self) -> ImageSize:
         max_image_size, _ = self._get_vision_info(
             image_width=9999999,
             image_height=9999999,
+            image_processor=None,
         )
         return max_image_size
 
-    def _get_max_image_tokens(self) -> int:
-        target_width, target_height = self._get_image_size_with_most_features()
+    def get_max_image_tokens(self) -> int:
+        target_width, target_height = self.get_image_size_with_most_features()
 
-        return self._get_num_image_tokens(
+        return self.get_num_image_tokens(
             image_width=target_width,
             image_height=target_height,
+            image_processor=None,
         )
 
     def _get_max_video_frames(self, max_tokens: int) -> int:
-        target_width, target_height = self._get_image_size_with_most_features()
+        target_width, target_height = self.get_image_size_with_most_features()
 
         num_frames = 0
 
         while True:
             next_num_frames = num_frames + 1
-            next_max_tokens = self._get_num_video_tokens(
+            next_max_tokens = self.get_num_video_tokens(
                 image_width=target_width,
                 image_height=target_height,
                 num_frames=next_num_frames,
+                image_processor=None,
             )
 
             if next_max_tokens > max_tokens:
@@ -860,12 +865,12 @@ def _get_max_video_frames(self, max_tokens: int) -> int:
 
         return num_frames
 
-    def _get_dummy_num_frames(self, seq_len: int) -> int:
+    def get_num_frames_with_most_features(self, seq_len: int) -> int:
         mm_config = self.ctx.get_mm_config()
         max_images = mm_config.limit_per_prompt.get("image", 1)
         max_videos = mm_config.limit_per_prompt.get("video", 1)
 
-        max_image_tokens = self._get_max_image_tokens() * max_images
+        max_image_tokens = self.get_max_image_tokens() * max_images
         max_total_frames = self._get_max_video_frames(seq_len -
                                                       max_image_tokens)
 
@@ -877,15 +882,19 @@ def _get_dummy_num_frames(self, seq_len: int) -> int:
 
         return num_frames
 
-    def _get_max_video_tokens(self, seq_len: int) -> int:
-        target_width, target_height = self._get_image_size_with_most_features()
+    def get_max_video_tokens(self, seq_len: int) -> int:
+        target_width, target_height = self.get_image_size_with_most_features()
 
-        return self._get_num_video_tokens(
+        return self.get_num_video_tokens(
             image_width=target_width,
             image_height=target_height,
-            num_frames=self._get_dummy_num_frames(seq_len),
+            num_frames=self.get_num_frames_with_most_features(seq_len),
+            image_processor=None,
         )
 
+
+class Qwen2VLDummyInputsBuilder(BaseDummyInputsBuilder[Qwen2VLProcessingInfo]):
+
     def get_dummy_processor_inputs(
         self,
         seq_len: int,
@@ -894,10 +903,14 @@ def get_dummy_processor_inputs(
         num_images = mm_counts.get("image", 0)
         num_videos = mm_counts.get("video", 0)
 
-        hf_processor = self._get_hf_processor()
+        hf_processor = self.info.get_hf_processor()
         image_token: str = hf_processor.image_token
         video_token: str = hf_processor.video_token
-        target_width, target_height = self._get_image_size_with_most_features()
+
+        target_width, target_height = \
+            self.info.get_image_size_with_most_features()
+        target_num_frames = \
+            self.info.get_num_frames_with_most_features(seq_len)
 
         mm_data = {
             "image":
@@ -908,7 +921,7 @@ def get_dummy_processor_inputs(
             self._get_dummy_videos(
                 width=target_width,
                 height=target_height,
-                num_frames=self._get_dummy_num_frames(seq_len),
+                num_frames=target_num_frames,
                 num_videos=num_videos,
             )
         }
@@ -919,11 +932,8 @@ def get_dummy_processor_inputs(
         )
 
 
-class Qwen2VLMultiModalProcessor(Qwen2VLProcessingMixin,
-                                 BaseMultiModalProcessor):
-
-    def _get_profiling_info(self) -> BaseProfilingInfo:
-        return Qwen2VLProfilingInfo(self.ctx)
+class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor[Qwen2VLProcessingInfo]
+                                 ):
 
     def _get_data_parser(self) -> MultiModalDataParser:
         return Qwen2MultiModalDataParser()
@@ -934,8 +944,9 @@ def _get_prompt_replacements(
         hf_processor_mm_kwargs: Mapping[str, Any],
         out_mm_kwargs: MultiModalKwargs,
     ) -> list[PromptReplacement]:
-        hf_processor = self._get_hf_processor(**hf_processor_mm_kwargs)
-        image_processor = self._get_image_processor(**hf_processor_mm_kwargs)
+        hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
+        image_processor = self.info.get_image_processor(
+            **hf_processor_mm_kwargs)
 
         # NOTE: Only Qwen2VLProcessor in transformers 4.47.0 has
         # image_token and video_token registered
@@ -991,7 +1002,9 @@ def _get_mm_fields_config(
         )
 
 
-@MULTIMODAL_REGISTRY.register_processor(Qwen2VLMultiModalProcessor)
+@MULTIMODAL_REGISTRY.register_processor(Qwen2VLMultiModalProcessor,
+                                        info=Qwen2VLProcessingInfo,
+                                        dummy_inputs=Qwen2VLDummyInputsBuilder)
 class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal,
                                       SupportsLoRA, SupportsPP):
     packed_modules_mapping = {
diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py
index ecafd157b1d61..fada22d685dd6 100644
--- a/vllm/model_executor/models/ultravox.py
+++ b/vllm/model_executor/models/ultravox.py
@@ -24,11 +24,10 @@
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs,
                                     NestedTensors)
-from vllm.multimodal.parse import MultiModalDataParser
+from vllm.multimodal.parse import MultiModalDataItems, MultiModalDataParser
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        MultiModalDataItems, ProcessingMixin,
-                                        PromptReplacement)
-from vllm.multimodal.profiling import BaseProfilingInfo, ProcessorInputs
+                                        BaseProcessingInfo, PromptReplacement)
+from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.configs.ultravox import UltravoxConfig
 
@@ -59,9 +58,9 @@ class UltravoxAudioEmbeddingInputs(TypedDict):
                             UltravoxAudioEmbeddingInputs]
 
 
-class UltravoxProcessingMixin(ProcessingMixin):
+class UltravoxProcessingInfo(BaseProcessingInfo):
 
-    def _get_hf_processor(
+    def get_hf_processor(
         self,
         *,
         # Ignored in initialization
@@ -76,37 +75,38 @@ def _get_hf_processor(
         hf_processor.audio_token_replacement = _AUDIO_PLACEHOLDER_OVERRIDE
         return hf_processor
 
-    def _get_feature_extractor(
+    def get_feature_extractor(
         self,
         *,
         # Ignored in initialization
         sampling_rate: Optional[int] = None,
     ) -> WhisperFeatureExtractor:
-        hf_processor = self._get_hf_processor(sampling_rate=sampling_rate)
+        hf_processor = self.get_hf_processor(sampling_rate=sampling_rate)
         audio_processor = hf_processor.audio_processor  # type: ignore
         feature_extractor = audio_processor.feature_extractor  # type: ignore
         assert isinstance(feature_extractor, WhisperFeatureExtractor)
         return feature_extractor
 
-
-class UltravoxProfilingInfo(UltravoxProcessingMixin, BaseProfilingInfo):
-
     def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
         return {"audio": None}
 
     def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
-        feature_extractor = self._get_feature_extractor()
+        feature_extractor = self.get_feature_extractor()
         max_audio_tokens = math.ceil(feature_extractor.chunk_length *
                                      _AUDIO_TOKENS_PER_SECOND)
 
         return {"audio": max_audio_tokens}
 
+
+class UltravoxDummyInputsBuilder(BaseDummyInputsBuilder[UltravoxProcessingInfo]
+                                 ):
+
     def get_dummy_processor_inputs(
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
     ) -> ProcessorInputs:
-        feature_extractor = self._get_feature_extractor()
+        feature_extractor = self.info.get_feature_extractor()
 
         sampling_rate = feature_extractor.sampling_rate
         audio_len = feature_extractor.chunk_length * sampling_rate
@@ -123,14 +123,11 @@ def get_dummy_processor_inputs(
         )
 
 
-class UltravoxMultiModalProcessor(UltravoxProcessingMixin,
-                                  BaseMultiModalProcessor):
-
-    def _get_profiling_info(self) -> BaseProfilingInfo:
-        return UltravoxProfilingInfo(self.ctx)
+class UltravoxMultiModalProcessor(
+        BaseMultiModalProcessor[UltravoxProcessingInfo]):
 
     def _get_data_parser(self) -> MultiModalDataParser:
-        feature_extractor = self._get_feature_extractor()
+        feature_extractor = self.info.get_feature_extractor()
         return MultiModalDataParser(target_sr=feature_extractor.sampling_rate)
 
     def _call_hf_processor(
@@ -141,7 +138,7 @@ def _call_hf_processor(
     ) -> BatchFeature:
         # Text-only input not supported in composite processor
         if not mm_data:
-            tokenizer = self._get_tokenizer()
+            tokenizer = self.info.get_tokenizer()
 
             prompt_ids = tokenizer.encode(
                 prompt,
@@ -160,7 +157,7 @@ def _call_hf_processor(
                 mm_kwargs=mm_kwargs,
             )
 
-        feature_extractor = self._get_feature_extractor()
+        feature_extractor = self.info.get_feature_extractor()
         mm_kwargs = dict(
             **mm_kwargs,
             sampling_rate=feature_extractor.sampling_rate,
@@ -208,7 +205,7 @@ def _get_prompt_replacements(
         hf_processor_mm_kwargs: Mapping[str, Any],
         out_mm_kwargs: MultiModalKwargs,
     ) -> list[PromptReplacement]:
-        hf_processor = self._get_hf_processor(**hf_processor_mm_kwargs)
+        hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
         placeholder = hf_processor.audio_token_replacement  # type: ignore
 
         def get_replacement_ultravox(item_idx: int):
@@ -342,7 +339,10 @@ def forward(
         return hidden_states
 
 
-@MULTIMODAL_REGISTRY.register_processor(UltravoxMultiModalProcessor)
+@MULTIMODAL_REGISTRY.register_processor(UltravoxMultiModalProcessor,
+                                        info=UltravoxProcessingInfo,
+                                        dummy_inputs=UltravoxDummyInputsBuilder
+                                        )
 class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP):
 
     hf_to_vllm_mapper = WeightsMapper(
diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py
index 41113cd85bd16..c6a30cacebdd1 100644
--- a/vllm/multimodal/processing.py
+++ b/vllm/multimodal/processing.py
@@ -4,12 +4,13 @@
 from collections.abc import Callable, ItemsView, Iterable, Mapping, Sequence
 from dataclasses import dataclass, field
 from functools import lru_cache
-from typing import Any, NamedTuple, Optional, Protocol, TypeVar, Union
+from typing import (TYPE_CHECKING, Generic, NamedTuple, Optional, Protocol,
+                    TypeVar, Union)
 
 from transformers import BatchFeature, PretrainedConfig, ProcessorMixin
 
-from vllm import envs
-from vllm.inputs import DummyData, InputProcessingContext
+import vllm.envs as envs
+from vllm.inputs import InputProcessingContext
 from vllm.logger import init_logger
 from vllm.transformers_utils.tokenizer import (AnyTokenizer, decode_tokens,
                                                encode_tokens)
@@ -20,7 +21,9 @@
                      MultiModalInputsV2, MultiModalKwargs,
                      MultiModalKwargsItem, PlaceholderRange)
 from .parse import MultiModalDataItems, MultiModalDataParser
-from .profiling import BaseProfilingInfo
+
+if TYPE_CHECKING:
+    from .profiling import BaseDummyInputsBuilder
 
 logger = init_logger(__name__)
 
@@ -46,8 +49,8 @@ class PromptReplacement:
     if it does not depend on the input.
     """
 
-    def bind(self, tokenizer: AnyTokenizer) -> "_BoundPromptReplacement":
-        return _BoundPromptReplacement(
+    def bind(self, tokenizer: AnyTokenizer) -> "BoundPromptReplacement":
+        return BoundPromptReplacement(
             tokenizer=tokenizer,
             modality=self.modality,
             _target=self.target,
@@ -128,7 +131,7 @@ def token_ids(self) -> list[int]:
 
 
 @dataclass
-class _BoundPromptReplacement:
+class BoundPromptReplacement:
     tokenizer: AnyTokenizer = field(repr=False)
     modality: str
 
@@ -207,7 +210,7 @@ def iter_token_matches(
 
 @dataclass(repr=False)
 class _PromptReplacementMatch(ABC):
-    prompt_repl: _BoundPromptReplacement
+    prompt_repl: BoundPromptReplacement
 
     @property
     def modality(self) -> str:
@@ -255,7 +258,7 @@ def end_idx(self) -> int:
 
 
 @dataclass
-class _PlaceholderInfo:
+class PlaceholderInfo:
     modality: str
     item_idx: int
     start_idx: int
@@ -274,7 +277,7 @@ def to_range(self) -> PlaceholderRange:
 
 def find_token_matches(
     prompt: list[int],
-    prompt_repls: Sequence[_BoundPromptReplacement],
+    prompt_repls: Sequence[BoundPromptReplacement],
 ) -> list[_PromptReplacementTokenMatch]:
     """Return each target of :code:`prompt_repls` found in :code:`prompt`."""
     return [
@@ -286,7 +289,7 @@ def find_token_matches(
 
 def find_text_matches(
     prompt: str,
-    prompt_repls: Sequence[_BoundPromptReplacement],
+    prompt_repls: Sequence[BoundPromptReplacement],
 ) -> list[_PromptReplacementTextMatch]:
     """Return each target of :code:`prompt_repls` found in :code:`prompt`."""
     return [
@@ -390,9 +393,9 @@ def replace_text_matches(
 def _iter_modality_placeholders(
     prompt: list[int],
     modality: str,
-    modality_repls: Sequence[_BoundPromptReplacement],
+    modality_repls: Sequence[BoundPromptReplacement],
     modal_item_count: int,
-) -> Iterable[_PlaceholderInfo]:
+) -> Iterable[PlaceholderInfo]:
     if modal_item_count == 0:
         return
 
@@ -413,7 +416,7 @@ def _iter_modality_placeholders(
                 continue
 
             if prompt[start_idx:end_idx] == repl_tokens:
-                yield _PlaceholderInfo(
+                yield PlaceholderInfo(
                     modality=modality,
                     item_idx=item_idx,
                     start_idx=start_idx,
@@ -434,10 +437,10 @@ def _iter_modality_placeholders(
 
 
 def _iter_placeholders(
-    mm_prompt_repls: Mapping[str, Sequence[_BoundPromptReplacement]],
+    mm_prompt_repls: Mapping[str, Sequence[BoundPromptReplacement]],
     prompt: list[int],
     mm_item_counts: Mapping[str, int],
-) -> Iterable[_PlaceholderInfo]:
+) -> Iterable[PlaceholderInfo]:
     """
     For each modality, yield each set of placeholder tokens found in
     :code:`prompt`.
@@ -455,10 +458,10 @@ def _iter_placeholders(
 
 
 def find_mm_placeholders(
-    mm_prompt_repls: Mapping[str, Sequence[_BoundPromptReplacement]],
+    mm_prompt_repls: Mapping[str, Sequence[BoundPromptReplacement]],
     prompt: list[int],
     mm_item_counts: Mapping[str, int],
-) -> Mapping[str, list[_PlaceholderInfo]]:
+) -> Mapping[str, list[PlaceholderInfo]]:
     it = _iter_placeholders(mm_prompt_repls, prompt, mm_item_counts)
     return dict(full_groupby_modality(it))
 
@@ -524,29 +527,59 @@ def put(
         self._cache.put(cache_key, output_kwargs)
 
 
-class ProcessingMixin:
-    """
-    Contains helper functions to perform processing.
+class BaseProcessingInfo:
+    """Base class containing information to perform processing."""
 
-    Not to be confused with :class:`transformers.ProcessorMixin`.
-    """
-    ctx: InputProcessingContext
+    def __init__(self, ctx: InputProcessingContext) -> None:
+        super().__init__()
 
-    def _get_tokenizer(self) -> AnyTokenizer:
+        self.ctx = ctx
+
+    @property
+    def model_id(self) -> str:
+        return self.ctx.model_config.model
+
+    def get_tokenizer(self) -> AnyTokenizer:
         return self.ctx.tokenizer
 
-    def _get_hf_config(self) -> PretrainedConfig:
+    def get_hf_config(self) -> PretrainedConfig:
         return self.ctx.get_hf_config()
 
-    def _get_hf_processor(self, **kwargs: object) -> ProcessorMixin:
+    def get_hf_processor(self, **kwargs: object) -> ProcessorMixin:
         """
         Subclasses can override this method to handle
         specific kwargs from model config or user inputs.
         """
         return self.ctx.get_hf_processor(**kwargs)
 
+    @abstractmethod
+    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+        """
+        Return the maximum supported number of items for each modality.
+
+        A value of `None` means unlimited number of items.
+
+        Omitting a modality from the returned dictionary means that
+        it is not supported at all.
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
+        """
+        Get the maximum possible number of tokens per data item
+        for each modality.
+
+        The dictionary returned by this method should have the same
+        keys as that returned by :meth:`get_supported_mm_limits`.
+        """
+        raise NotImplementedError
+
+
+_I = TypeVar("_I", bound=BaseProcessingInfo)
 
-class BaseMultiModalProcessor(ProcessingMixin, ABC):
+
+class BaseMultiModalProcessor(ABC, Generic[_I]):
     """
     Abstract base class to process multi-modal inputs to be used in vLLM.
 
@@ -554,18 +587,19 @@ class BaseMultiModalProcessor(ProcessingMixin, ABC):
     """
 
     def __init__(self,
-                 ctx: InputProcessingContext,
+                 info: _I,
+                 dummy_inputs: "BaseDummyInputsBuilder[_I]",
                  *,
                  cache: Optional[ProcessingCache] = None,
                  enable_sanity_checks: bool = True) -> None:
         super().__init__()
 
-        self.ctx = ctx
+        self.info = info
+        self.dummy_inputs = dummy_inputs
         self.cache = cache
         self.enable_sanity_checks = enable_sanity_checks
 
         self.data_parser = self._get_data_parser()
-        self.profiling_info = self._get_profiling_info()
 
     def __call__(
         self,
@@ -585,13 +619,6 @@ def _get_data_parser(self) -> MultiModalDataParser:
         """
         return MultiModalDataParser()
 
-    def _get_profiling_info(self) -> BaseProfilingInfo:
-        """
-        Get the profiling information to find the worst-case memory usage of
-        the model.
-        """
-        raise NotImplementedError
-
     def _to_mm_items(
         self,
         mm_data: MultiModalDataDict,
@@ -602,7 +629,7 @@ def _to_mm_items(
         """
         mm_items = self.data_parser.parse_mm_data(mm_data)
 
-        mm_limits = self.ctx.get_mm_config().limit_per_prompt
+        mm_limits = self.info.ctx.get_mm_config().limit_per_prompt
         for modality, items in mm_items.items():
             limit = mm_limits.get(modality, 1)
             if len(items) > limit:
@@ -646,19 +673,19 @@ def _get_prompt_replacements(
 
     def _find_mm_placeholders(
         self,
-        mm_prompt_repls: Mapping[str, Sequence[_BoundPromptReplacement]],
+        mm_prompt_repls: Mapping[str, Sequence[BoundPromptReplacement]],
         new_token_ids: list[int],
         mm_item_counts: Mapping[str, int],
-    ) -> Mapping[str, list[_PlaceholderInfo]]:
+    ) -> Mapping[str, list[PlaceholderInfo]]:
         return find_mm_placeholders(mm_prompt_repls, new_token_ids,
                                     mm_item_counts)
 
     def _get_hf_mm_data(
         self,
         mm_items: MultiModalDataItems,
-    ) -> tuple[dict[str, Any], dict[str, Any]]:
-        processor_data = dict[str, Any]()
-        passthrough_data = dict[str, Any]()
+    ) -> tuple[Mapping[str, object], Mapping[str, object]]:
+        processor_data = dict[str, object]()
+        passthrough_data = dict[str, object]()
 
         for items in mm_items.values():
             processor_data.update(items.get_processor_data())
@@ -678,8 +705,8 @@ def _call_hf_processor(
         Call the HF processor on the prompt text and
         associated multi-modal data.
         """
-        return self.ctx.call_hf_processor(
-            self._get_hf_processor(**mm_kwargs),
+        return self.info.ctx.call_hf_processor(
+            self.info.get_hf_processor(**mm_kwargs),
             dict(text=prompt, **mm_data),
             mm_kwargs,
         )
@@ -738,8 +765,8 @@ def _apply_hf_processor_missing(
 
         # Some HF processors (e.g. Qwen2-VL) expect corresponding
         # multi-modal tokens to be in the prompt text
-        dummy_inputs = self.profiling_info.get_dummy_processor_inputs(
-            self.ctx.model_config.max_model_len,
+        dummy_inputs = self.dummy_inputs.get_dummy_processor_inputs(
+            self.info.ctx.model_config.max_model_len,
             mm_missing_counts,
         )
 
@@ -762,7 +789,7 @@ def _cached_apply_hf_processor(
         caching the results and reusing cached results.
         """
         cache = self.cache
-        model_id = self.ctx.model_config.model
+        model_id = self.info.model_id
 
         _, passthrough_data = self._get_hf_mm_data(mm_data_items)
         if cache is None or passthrough_data:
@@ -838,8 +865,8 @@ def _cached_apply_hf_processor(
     def _bind_and_group_repls(
         self,
         prompt_repls: list[PromptReplacement],
-    ) -> dict[str, list[_BoundPromptReplacement]]:
-        tokenizer = self._get_tokenizer()
+    ) -> dict[str, list[BoundPromptReplacement]]:
+        tokenizer = self.info.get_tokenizer()
 
         it = (prompt_repl.bind(tokenizer) for prompt_repl in prompt_repls)
         return dict(full_groupby_modality(it))
@@ -859,10 +886,10 @@ def _always_apply_prompt_replacements(self) -> bool:
     def _apply_prompt_replacements(
         self,
         token_ids: list[int],
-        mm_prompt_repls: Mapping[str, Sequence[_BoundPromptReplacement]],
+        mm_prompt_repls: Mapping[str, Sequence[BoundPromptReplacement]],
         mm_item_counts: Mapping[str, int],
-    ) -> tuple[list[int], str, Mapping[str, list[_PlaceholderInfo]]]:
-        tokenizer = self._get_tokenizer()
+    ) -> tuple[list[int], str, Mapping[str, list[PlaceholderInfo]]]:
+        tokenizer = self.info.get_tokenizer()
 
         mm_token_matches = {
             modality: find_token_matches(token_ids, prompt_repls)
@@ -950,7 +977,7 @@ def _validate_mm_kwargs(
 
     def _validate_mm_placeholders(
         self,
-        mm_placeholders: Mapping[str, list[_PlaceholderInfo]],
+        mm_placeholders: Mapping[str, list[PlaceholderInfo]],
         mm_item_counts: Mapping[str, int],
         *,
         allow_missing: bool = False,
@@ -1001,7 +1028,7 @@ def apply(
         # instead of rehashing.
 
         if envs.VLLM_USE_V1:
-            model_id = self.ctx.model_config.model
+            model_id = self.info.model_id
             mm_hashes = {
                 modality: [
                     MultiModalHasher.hash_kwargs(model_id=model_id,
@@ -1046,7 +1073,7 @@ def apply(
                 allow_missing=True,
             )
 
-            mm_missing_repls = dict[str, list[_BoundPromptReplacement]]()
+            mm_missing_repls = dict[str, list[BoundPromptReplacement]]()
             for modality, missing_repl_count in mm_missing_repl_counts.items():
                 if missing_repl_count == 0:
                     mm_missing_repls[modality] = []
@@ -1059,7 +1086,7 @@ def apply(
         # If HF processor already inserts placeholder tokens,
         # there is no need for us to insert them
         if all(len(repls) == 0 for repls in mm_missing_repls.items()):
-            tokenizer = self._get_tokenizer()
+            tokenizer = self.info.get_tokenizer()
             prompt_text = decode_tokens(tokenizer, prompt_ids)
             mm_placeholders = hf_mm_placeholders
         else:
@@ -1090,79 +1117,3 @@ def apply(
             mm_hashes=mm_hashes,
             mm_placeholders=mm_placeholder_ranges,
         )
-
-    def _get_dummy_mm_inputs(
-        self,
-        seq_len: int,
-        mm_counts: Mapping[str, int],
-    ) -> MultiModalInputsV2:
-        profiling = self.profiling_info
-        processor_inputs = profiling.get_dummy_processor_inputs(
-            seq_len, mm_counts)
-
-        return self.apply(
-            prompt_text=processor_inputs.prompt_text,
-            mm_data=processor_inputs.mm_data,
-            hf_processor_mm_kwargs=processor_inputs.hf_processor_mm_kwargs,
-        )
-
-    def get_dummy_data(self, seq_len: int) -> DummyData:
-        # Avoid circular import
-        from vllm.sequence import SequenceData
-
-        profiling = self.profiling_info
-        mm_counts = profiling.get_mm_limits()
-        mm_max_tokens_per_item = profiling.get_mm_max_tokens_per_item(seq_len)
-        if mm_counts.keys() != mm_max_tokens_per_item.keys():
-            raise AssertionError(
-                "The keys returned by `get_supported_mm_limits`"
-                f"({set(mm_counts.keys())}) should be the same as those "
-                "returned by `get_mm_max_tokens_per_item` "
-                f"({set(mm_max_tokens_per_item.keys())})")
-
-        mm_inputs = self._get_dummy_mm_inputs(seq_len, mm_counts)
-        prompt_token_ids = mm_inputs["prompt_token_ids"]
-        placeholders_by_modality = mm_inputs["mm_placeholders"]
-
-        total_placeholders_by_modality = {
-            modality: sum(item["length"] for item in placeholders)
-            for modality, placeholders in placeholders_by_modality.items()
-        }
-        expected_placeholders_by_modality = {
-            modality: mm_max_tokens_per_item[modality] * mm_counts[modality]
-            for modality in placeholders_by_modality
-        }
-        if total_placeholders_by_modality != expected_placeholders_by_modality:
-            raise AssertionError(
-                f"The processed dummy data has a total of "
-                f"{total_placeholders_by_modality} placeholder tokens, which "
-                f"is not the expected {expected_placeholders_by_modality} "
-                "tokens.")
-
-        total_len = len(prompt_token_ids)
-
-        # V0 does not support chunked prefill.
-        if total_len > seq_len and not envs.VLLM_USE_V1:
-            logger.warning(
-                "The context length (%d) of the model is too short "
-                "to hold the multi-modal embeddings in the worst case "
-                "(%d tokens in total, out of which %s are reserved for "
-                "multi-modal embeddings). This may cause certain multi-modal "
-                "inputs to fail during inference, even when the input text is "
-                "short. To avoid this, you should increase `max_model_len`, "
-                "reduce `max_num_seqs`, and/or reduce `mm_counts`.", seq_len,
-                total_len, total_placeholders_by_modality)
-
-            return DummyData(
-                seq_data=SequenceData.from_prompt_token_counts((0, seq_len)),
-                multi_modal_data=None,
-                multi_modal_placeholders=None,
-            )
-
-        prompt_token_ids.extend([0] * (seq_len - len(prompt_token_ids)))
-
-        return DummyData(
-            seq_data=SequenceData.from_seqs(prompt_token_ids),
-            multi_modal_data=mm_inputs["mm_kwargs"],
-            multi_modal_placeholders=placeholders_by_modality,
-        )
diff --git a/vllm/multimodal/profiling.py b/vllm/multimodal/profiling.py
index 2ecf0db1a485d..2ac3a6bcf3ddd 100644
--- a/vllm/multimodal/profiling.py
+++ b/vllm/multimodal/profiling.py
@@ -1,16 +1,18 @@
 from abc import ABC, abstractmethod
 from collections.abc import Mapping
 from dataclasses import dataclass, field
-from typing import Optional
+from typing import Generic, TypeVar
 
 import numpy as np
 import numpy.typing as npt
 from PIL import Image
 
-from vllm.inputs import InputProcessingContext
+import vllm.envs as envs
+from vllm.inputs import DummyData
 from vllm.logger import init_logger
 
-from .inputs import MultiModalDataDict
+from .inputs import MultiModalDataDict, MultiModalInputsV2
+from .processing import BaseMultiModalProcessor, BaseProcessingInfo
 
 logger = init_logger(__name__)
 
@@ -23,39 +25,19 @@ class ProcessorInputs:
     hf_processor_mm_kwargs: Mapping[str, object] = field(default_factory=dict)
 
 
-class BaseProfilingInfo(ABC):
+_I = TypeVar("_I", bound=BaseProcessingInfo)
+
+
+class BaseDummyInputsBuilder(ABC, Generic[_I]):
     """
-    Abstract base class that provides the information necessary to profile
+    Abstract base class that constructs the dummy data to profile
     multi-modal models.
     """
 
-    def __init__(self, ctx: InputProcessingContext) -> None:
+    def __init__(self, info: _I) -> None:
         super().__init__()
 
-        self.ctx = ctx
-
-    @abstractmethod
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
-        """
-        Return the maximum supported number of items for each modality.
-
-        A value of `None` means unlimited number of items.
-
-        Omitting a modality from the returned dictionary means that
-        it is not supported at all.
-        """
-        raise NotImplementedError
-
-    @abstractmethod
-    def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
-        """
-        Get the maximum possible number of tokens per data item
-        for each modality.
-
-        The dictionary returned by this method should have the same
-        keys as that returned by :meth:`get_supported_mm_limits`.
-        """
-        raise NotImplementedError
+        self.info = info
 
     @abstractmethod
     def get_dummy_processor_inputs(
@@ -64,8 +46,8 @@ def get_dummy_processor_inputs(
         mm_counts: Mapping[str, int],
     ) -> ProcessorInputs:
         """
-        Build the multi-modal portion of the input which, after processing,
-        results in `mm_max_tokens` in :meth:`get_mm_max_tokens_per_item`.
+        Build the input which, after processing, results in
+        `self.info.get_mm_max_tokens_per_item()` placeholder tokens.
         """
         raise NotImplementedError
 
@@ -99,11 +81,33 @@ def _get_dummy_videos(
         video = np.zeros((num_frames, width, height, 3))
         return [video] * num_videos
 
-    def get_mm_limits(self) -> Mapping[str, int]:
-        mm_config = self.ctx.get_mm_config()
+
+class MultiModalProfiler(Generic[_I]):
+    """
+    Contains code for running memory profiling for multi-modal models.
+    """
+
+    def __init__(
+        self,
+        processor: BaseMultiModalProcessor[_I],
+    ) -> None:
+        super().__init__()
+
+        self.processor = processor
+
+    @property
+    def processing_info(self) -> BaseProcessingInfo:
+        return self.processor.info
+
+    @property
+    def dummy_inputs(self) -> BaseDummyInputsBuilder[_I]:
+        return self.processor.dummy_inputs
+
+    def _get_mm_limits(self) -> Mapping[str, int]:
+        mm_config = self.processing_info.ctx.get_mm_config()
         mm_limit_per_prompt = mm_config.limit_per_prompt
 
-        supported_mm_limits = self.get_supported_mm_limits()
+        supported_mm_limits = self.processing_info.get_supported_mm_limits()
 
         mm_limits = {
             modality: mm_limit_per_prompt.get(modality, 1)
@@ -119,3 +123,81 @@ def get_mm_limits(self) -> Mapping[str, int]:
                     f"at most {supported_limit} {modality} items.")
 
         return mm_limits
+
+    def _get_dummy_mm_inputs(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+    ) -> MultiModalInputsV2:
+        factory = self.dummy_inputs
+        processor_inputs = factory.get_dummy_processor_inputs(
+            seq_len, mm_counts)
+
+        return self.processor.apply(
+            prompt_text=processor_inputs.prompt_text,
+            mm_data=processor_inputs.mm_data,
+            hf_processor_mm_kwargs=processor_inputs.hf_processor_mm_kwargs,
+        )
+
+    def get_dummy_data(self, seq_len: int) -> DummyData:
+        # Avoid circular import
+        from vllm.sequence import SequenceData
+
+        mm_counts = self._get_mm_limits()
+
+        info = self.processing_info
+        mm_max_tokens_per_item = info.get_mm_max_tokens_per_item(seq_len)
+
+        if mm_counts.keys() != mm_max_tokens_per_item.keys():
+            raise AssertionError(
+                "The keys returned by `get_supported_mm_limits`"
+                f"({set(mm_counts.keys())}) should be the same as those "
+                "returned by `get_mm_max_tokens_per_item` "
+                f"({set(mm_max_tokens_per_item.keys())})")
+
+        mm_inputs = self._get_dummy_mm_inputs(seq_len, mm_counts)
+        prompt_token_ids = mm_inputs["prompt_token_ids"]
+        placeholders_by_modality = mm_inputs["mm_placeholders"]
+
+        total_placeholders_by_modality = {
+            modality: sum(item["length"] for item in placeholders)
+            for modality, placeholders in placeholders_by_modality.items()
+        }
+        expected_placeholders_by_modality = {
+            modality: mm_max_tokens_per_item[modality] * mm_counts[modality]
+            for modality in placeholders_by_modality
+        }
+        if total_placeholders_by_modality != expected_placeholders_by_modality:
+            raise AssertionError(
+                f"The processed dummy data has a total of "
+                f"{total_placeholders_by_modality} placeholder tokens, which "
+                f"is not the expected {expected_placeholders_by_modality} "
+                "tokens.")
+
+        total_len = len(prompt_token_ids)
+
+        # V0 does not support chunked prefill.
+        if total_len > seq_len and not envs.VLLM_USE_V1:
+            logger.warning(
+                "The context length (%d) of the model is too short "
+                "to hold the multi-modal embeddings in the worst case "
+                "(%d tokens in total, out of which %s are reserved for "
+                "multi-modal embeddings). This may cause certain multi-modal "
+                "inputs to fail during inference, even when the input text is "
+                "short. To avoid this, you should increase `max_model_len`, "
+                "reduce `max_num_seqs`, and/or reduce `mm_counts`.", seq_len,
+                total_len, total_placeholders_by_modality)
+
+            return DummyData(
+                seq_data=SequenceData.from_prompt_token_counts((0, seq_len)),
+                multi_modal_data=None,
+                multi_modal_placeholders=None,
+            )
+
+        prompt_token_ids.extend([0] * (seq_len - len(prompt_token_ids)))
+
+        return DummyData(
+            seq_data=SequenceData.from_seqs(prompt_token_ids),
+            multi_modal_data=mm_inputs["mm_kwargs"],
+            multi_modal_placeholders=placeholders_by_modality,
+        )
diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py
index f75a594a4c4e0..5f01eac4edade 100644
--- a/vllm/multimodal/registry.py
+++ b/vllm/multimodal/registry.py
@@ -1,7 +1,8 @@
 import functools
 from collections import UserDict
-from typing import (TYPE_CHECKING, Any, Dict, Mapping, Optional, Protocol,
-                    Sequence, Type, TypeVar)
+from dataclasses import dataclass
+from typing import (TYPE_CHECKING, Any, Dict, Generic, Mapping, Optional,
+                    Protocol, Sequence, Type, TypeVar)
 
 import torch.nn as nn
 
@@ -14,7 +15,9 @@
 from .base import MultiModalInputMapper, MultiModalPlugin, MultiModalTokensCalc
 from .image import ImagePlugin
 from .inputs import MultiModalDataDict, MultiModalKwargs, NestedTensors
-from .processing import BaseMultiModalProcessor, ProcessingCache
+from .processing import (BaseMultiModalProcessor, BaseProcessingInfo,
+                         ProcessingCache)
+from .profiling import BaseDummyInputsBuilder
 from .utils import cached_get_tokenizer
 from .video import VideoPlugin
 
@@ -27,20 +30,59 @@
 MM_CACHE_SIZE = 256
 
 N = TypeVar("N", bound=Type[nn.Module])
+_I = TypeVar("_I", bound=BaseProcessingInfo)
+_I_co = TypeVar("_I_co", bound=BaseProcessingInfo, covariant=True)
 
 
-class MultiModalProcessorFactory(Protocol):
+class ProcessingInfoFactory(Protocol[_I_co]):
-    """Constructs a :class:`MultiModalProcessor` instance from the context."""
+    """Constructs a :class:`BaseProcessingInfo` instance from the context."""
 
     def __call__(
         self,
         ctx: InputProcessingContext,
+    ) -> _I_co:
+        ...
+
+
+class DummyInputsBuilderFactory(Protocol[_I]):
+    """
+    Constructs a :class:`BaseDummyInputsBuilder` instance from processing info.
+    """
+
+    def __call__(self, info: _I) -> BaseDummyInputsBuilder[_I]:
+        ...
+
+
+class MultiModalProcessorFactory(Protocol[_I]):
+    """Constructs a :class:`BaseMultiModalProcessor` from processing info."""
+
+    def __call__(
+        self,
+        info: _I,
+        dummy_inputs: BaseDummyInputsBuilder[_I],
         *,
         cache: Optional[ProcessingCache] = None,
-    ) -> BaseMultiModalProcessor:
+    ) -> BaseMultiModalProcessor[_I]:
         ...
 
 
+@dataclass(frozen=True)
+class _ProcessorFactories(Generic[_I]):
+    info: ProcessingInfoFactory[_I]
+    processor: MultiModalProcessorFactory[_I]
+    dummy_inputs: DummyInputsBuilderFactory[_I]
+
+    def build_processor(
+        self,
+        ctx: InputProcessingContext,
+        *,
+        cache: Optional[ProcessingCache] = None,
+    ):
+        info = self.info(ctx)
+        dummy_inputs_builder = self.dummy_inputs(info)
+        return self.processor(info, dummy_inputs_builder, cache=cache)
+
+
 class _MultiModalLimits(UserDict["ModelConfig", Dict[str, int]]):
     """
     Wraps `_limits_by_model` for a more informative error message
@@ -71,7 +113,7 @@ def __init__(
         self._plugins = {p.get_data_key(): p for p in plugins}
 
         self._processor_factories = ClassRegistry[nn.Module,
-                                                  MultiModalProcessorFactory]()
+                                                  _ProcessorFactories]()
 
         # This is used for non-multimodal models
         self._disabled_limits_per_plugin = {k: 0 for k in self._plugins}
@@ -224,7 +266,7 @@ def get_max_tokens_per_item_by_modality(
             tokenizer = cached_get_tokenizer(model_config.tokenizer)
             processor = self.create_processor(model_config, tokenizer)
             seq_len = model_config.max_model_len
-            return processor.profiling_info.get_mm_max_tokens_per_item(seq_len)
+            return processor.info.get_mm_max_tokens_per_item(seq_len)
 
         return {
             key: plugin.get_max_multimodal_tokens(model_config)
@@ -315,7 +357,10 @@ def get_mm_limits_per_prompt(
 
     def register_processor(
         self,
-        factory: MultiModalProcessorFactory,
+        processor: MultiModalProcessorFactory[_I],
+        *,
+        info: ProcessingInfoFactory[_I],
+        dummy_inputs: DummyInputsBuilderFactory[_I],
     ):
         """
         Register a multi-modal processor to a model class. The processor
@@ -336,7 +381,11 @@ def wrapper(model_cls: N) -> N:
                     "registered to %s. It is overwritten by the new one.",
                     model_cls, self)
 
-            self._processor_factories[model_cls] = factory
+            self._processor_factories[model_cls] = _ProcessorFactories(
+                info=info,
+                dummy_inputs=dummy_inputs,
+                processor=processor,
+            )
 
             return model_cls
 
@@ -359,15 +408,15 @@ def create_processor(
         self,
         model_config: "ModelConfig",
         tokenizer: AnyTokenizer,
-    ) -> BaseMultiModalProcessor:
+    ) -> BaseMultiModalProcessor[BaseProcessingInfo]:
         """
         Create a multi-modal processor for a specific model and tokenizer.
         """
         model_cls = self._get_model_cls(model_config)
-        processor_factory = self._processor_factories[model_cls]
+        factories = self._processor_factories[model_cls]
 
         ctx = InputProcessingContext(model_config, tokenizer)
         cache = (None if model_config.disable_mm_preprocessor_cache else
                  self._processing_cache)
 
-        return processor_factory(ctx, cache=cache)
+        return factories.build_processor(ctx, cache=cache)

From aba8d6ee006b78149ac4514f460e4038b2d4f607 Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Wed, 8 Jan 2025 13:09:53 +0000
Subject: [PATCH 039/115] [Doc] Move examples into categories (#11840)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 .buildkite/run-cpu-test.sh                    |  2 +-
 .buildkite/run-gh200-test.sh                  |  2 +-
 .buildkite/run-hpu-test.sh                    |  2 +-
 .buildkite/run-neuron-test.sh                 |  2 +-
 .buildkite/run-openvino-test.sh               |  2 +-
 .buildkite/run-tpu-test.sh                    |  2 +-
 .buildkite/run-xpu-test.sh                    |  4 +-
 .buildkite/test-pipeline.yaml                 | 26 +++++------
 .github/workflows/lint-and-deploy.yaml        |  4 +-
 Dockerfile                                    |  2 +-
 .../contributing/profiling/profiling_index.md |  2 +-
 docs/source/deployment/frameworks/skypilot.md |  4 +-
 docs/source/features/disagg_prefill.md        |  2 +-
 docs/source/features/lora.md                  |  2 +-
 docs/source/features/quantization/auto_awq.md |  2 +-
 .../features/quantization/fp8_e4m3_kvcache.md |  2 +-
 docs/source/features/structured_outputs.md    |  4 +-
 docs/source/generate_examples.py              | 45 ++++++++++---------
 .../getting_started/installation/cpu-x86.md   |  4 +-
 .../getting_started/installation/xpu.md       |  2 +-
 docs/source/getting_started/quickstart.md     |  4 +-
 .../source/getting_started/troubleshooting.md |  2 +-
 docs/source/models/extensions/tensorizer.md   |  2 +-
 docs/source/models/generative_models.md       |  4 +-
 docs/source/models/pooling_models.md          |  6 +--
 docs/source/serving/distributed_serving.md    |  2 +-
 docs/source/serving/multimodal_inputs.md      | 16 +++----
 .../serving/openai_compatible_server.md       | 10 ++---
 .../{ => offline_inference}/aqlm_example.py   |  0
 .../{ => offline_inference}/cpu_offload.py    |  0
 .../florence2_inference.py                    |  3 +-
 .../{ => offline_inference}/gguf_inference.py |  0
 .../llm_engine_example.py                     |  0
 .../lora_with_quantization_inference.py       |  0
 .../multilora_inference.py                    |  0
 .../offline_chat_with_tools.py                |  0
 .../offline_inference.py                      |  0
 .../offline_inference_arctic.py               |  0
 .../offline_inference_audio_language.py       |  0
 .../offline_inference_chat.py                 |  0
 .../offline_inference_classification.py       |  0
 .../offline_inference_cli.py                  |  0
 .../offline_inference_distributed.py          |  0
 .../offline_inference_embedding.py            |  0
 .../offline_inference_encoder_decoder.py      |  0
 .../offline_inference_mlpspeculator.py        |  0
 .../offline_inference_neuron.py               |  0
 ...line_inference_neuron_int8_quantization.py |  0
 .../offline_inference_openai.md               | 18 ++++----
 .../openai_example_batch.jsonl                |  0
 .../offline_inference_pixtral.py              |  0
 .../offline_inference_scoring.py              |  0
 .../offline_inference_structured_outputs.py   |  0
 .../offline_inference_tpu.py                  |  0
 .../offline_inference_vision_language.py      |  0
 ...ine_inference_vision_language_embedding.py |  0
 ...e_inference_vision_language_multi_image.py |  0
 .../offline_inference_whisper.py              |  0
 ...nference_with_default_generation_config.py |  0
 .../offline_inference_with_prefix.py          |  0
 .../offline_inference_with_profiler.py        |  0
 .../offline_profile.py                        |  2 +-
 .../save_sharded_state.py                     |  0
 examples/{ => online_serving}/api_client.py   |  0
 .../chart-helm/.helmignore                    |  0
 .../chart-helm/Chart.yaml                     |  0
 examples/online_serving/chart-helm/README.md  | 21 +++++++++
 .../{ => online_serving}/chart-helm/ct.yaml   |  0
 .../chart-helm/lintconf.yaml                  |  0
 .../chart-helm/templates/_helpers.tpl         |  0
 .../chart-helm/templates/configmap.yaml       |  0
 .../chart-helm/templates/custom-objects.yaml  |  0
 .../chart-helm/templates/deployment.yaml      |  0
 .../chart-helm/templates/hpa.yaml             |  0
 .../chart-helm/templates/job.yaml             |  0
 .../templates/poddisruptionbudget.yaml        |  0
 .../chart-helm/templates/pvc.yaml             |  0
 .../chart-helm/templates/secrets.yaml         |  0
 .../chart-helm/templates/service.yaml         |  0
 .../chart-helm/values.schema.json             |  0
 .../chart-helm/values.yaml                    |  0
 .../disaggregated_prefill.sh                  |  0
 .../gradio_openai_chatbot_webserver.py        |  0
 .../{ => online_serving}/gradio_webserver.py  |  0
 .../openai_chat_completion_client.py          |  0
 ...i_chat_completion_client_for_multimodal.py |  0
 ...penai_chat_completion_client_with_tools.py |  0
 ...enai_chat_completion_structured_outputs.py |  0
 ...ai_chat_embedding_client_for_multimodal.py |  0
 .../openai_completion_client.py               |  0
 .../openai_cross_encoder_score.py             |  0
 .../openai_embedding_client.py                |  0
 .../openai_pooling_client.py                  |  0
 .../opentelemetry/Otel.md                     |  0
 .../opentelemetry/dummy_client.py             |  0
 .../prometheus_grafana/README.md              |  0
 .../prometheus_grafana/docker-compose.yaml    |  0
 .../prometheus_grafana/grafana.json           |  0
 .../prometheus_grafana/prometheus.yaml        |  0
 examples/{ => online_serving}/run_cluster.sh  |  0
 .../sagemaker-entrypoint.sh                   |  0
 examples/{ => other}/fp8/README.md            | 10 ++---
 examples/{ => other}/fp8/extract_scales.py    |  0
 examples/{ => other}/fp8/quantizer/README.md  |  0
 .../{ => other}/fp8/quantizer/quantize.py     |  0
 examples/{ => other}/logging_configuration.md |  0
 examples/{ => other}/tensorize_vllm_model.py  | 10 ++---
 pyproject.toml                                |  2 +-
 tests/plugins_tests/test_platform_plugins.py  |  2 +-
 tests/tensorizer_loader/test_tensorizer.py    |  4 +-
 tools/profiler/print_layerwise_table.py       |  2 +-
 tools/profiler/visualize_layerwise_profile.py | 10 ++---
 vllm/distributed/kv_transfer/README.md        |  2 +-
 vllm/model_executor/model_loader/loader.py    | 11 ++---
 .../model_executor/model_loader/tensorizer.py | 14 +++---
 .../model_loader/weight_utils.py              |  3 +-
 116 files changed, 153 insertions(+), 124 deletions(-)
 rename examples/{ => offline_inference}/aqlm_example.py (100%)
 rename examples/{ => offline_inference}/cpu_offload.py (100%)
 rename examples/{ => offline_inference}/florence2_inference.py (92%)
 rename examples/{ => offline_inference}/gguf_inference.py (100%)
 rename examples/{ => offline_inference}/llm_engine_example.py (100%)
 rename examples/{ => offline_inference}/lora_with_quantization_inference.py (100%)
 rename examples/{ => offline_inference}/multilora_inference.py (100%)
 rename examples/{ => offline_inference}/offline_chat_with_tools.py (100%)
 rename examples/{ => offline_inference}/offline_inference.py (100%)
 rename examples/{ => offline_inference}/offline_inference_arctic.py (100%)
 rename examples/{ => offline_inference}/offline_inference_audio_language.py (100%)
 rename examples/{ => offline_inference}/offline_inference_chat.py (100%)
 rename examples/{ => offline_inference}/offline_inference_classification.py (100%)
 rename examples/{ => offline_inference}/offline_inference_cli.py (100%)
 rename examples/{ => offline_inference}/offline_inference_distributed.py (100%)
 rename examples/{ => offline_inference}/offline_inference_embedding.py (100%)
 rename examples/{ => offline_inference}/offline_inference_encoder_decoder.py (100%)
 rename examples/{ => offline_inference}/offline_inference_mlpspeculator.py (100%)
 rename examples/{ => offline_inference}/offline_inference_neuron.py (100%)
 rename examples/{ => offline_inference}/offline_inference_neuron_int8_quantization.py (100%)
 rename examples/{ => offline_inference/offline_inference_openai}/offline_inference_openai.md (90%)
 rename examples/{ => offline_inference/offline_inference_openai}/openai_example_batch.jsonl (100%)
 rename examples/{ => offline_inference}/offline_inference_pixtral.py (100%)
 rename examples/{ => offline_inference}/offline_inference_scoring.py (100%)
 rename examples/{ => offline_inference}/offline_inference_structured_outputs.py (100%)
 rename examples/{ => offline_inference}/offline_inference_tpu.py (100%)
 rename examples/{ => offline_inference}/offline_inference_vision_language.py (100%)
 rename examples/{ => offline_inference}/offline_inference_vision_language_embedding.py (100%)
 rename examples/{ => offline_inference}/offline_inference_vision_language_multi_image.py (100%)
 rename examples/{ => offline_inference}/offline_inference_whisper.py (100%)
 rename examples/{ => offline_inference}/offline_inference_with_default_generation_config.py (100%)
 rename examples/{ => offline_inference}/offline_inference_with_prefix.py (100%)
 rename examples/{ => offline_inference}/offline_inference_with_profiler.py (100%)
 rename examples/{ => offline_inference}/offline_profile.py (99%)
 rename examples/{ => offline_inference}/save_sharded_state.py (100%)
 rename examples/{ => online_serving}/api_client.py (100%)
 rename examples/{ => online_serving}/chart-helm/.helmignore (100%)
 rename examples/{ => online_serving}/chart-helm/Chart.yaml (100%)
 create mode 100644 examples/online_serving/chart-helm/README.md
 rename examples/{ => online_serving}/chart-helm/ct.yaml (100%)
 rename examples/{ => online_serving}/chart-helm/lintconf.yaml (100%)
 rename examples/{ => online_serving}/chart-helm/templates/_helpers.tpl (100%)
 rename examples/{ => online_serving}/chart-helm/templates/configmap.yaml (100%)
 rename examples/{ => online_serving}/chart-helm/templates/custom-objects.yaml (100%)
 rename examples/{ => online_serving}/chart-helm/templates/deployment.yaml (100%)
 rename examples/{ => online_serving}/chart-helm/templates/hpa.yaml (100%)
 rename examples/{ => online_serving}/chart-helm/templates/job.yaml (100%)
 rename examples/{ => online_serving}/chart-helm/templates/poddisruptionbudget.yaml (100%)
 rename examples/{ => online_serving}/chart-helm/templates/pvc.yaml (100%)
 rename examples/{ => online_serving}/chart-helm/templates/secrets.yaml (100%)
 rename examples/{ => online_serving}/chart-helm/templates/service.yaml (100%)
 rename examples/{ => online_serving}/chart-helm/values.schema.json (100%)
 rename examples/{ => online_serving}/chart-helm/values.yaml (100%)
 rename examples/{ => online_serving}/disaggregated_prefill.sh (100%)
 rename examples/{ => online_serving}/gradio_openai_chatbot_webserver.py (100%)
 rename examples/{ => online_serving}/gradio_webserver.py (100%)
 rename examples/{ => online_serving}/openai_chat_completion_client.py (100%)
 rename examples/{ => online_serving}/openai_chat_completion_client_for_multimodal.py (100%)
 rename examples/{ => online_serving}/openai_chat_completion_client_with_tools.py (100%)
 rename examples/{ => online_serving}/openai_chat_completion_structured_outputs.py (100%)
 rename examples/{ => online_serving}/openai_chat_embedding_client_for_multimodal.py (100%)
 rename examples/{ => online_serving}/openai_completion_client.py (100%)
 rename examples/{ => online_serving}/openai_cross_encoder_score.py (100%)
 rename examples/{ => online_serving}/openai_embedding_client.py (100%)
 rename examples/{ => online_serving}/openai_pooling_client.py (100%)
 rename examples/{ => online_serving}/opentelemetry/Otel.md (100%)
 rename examples/{ => online_serving}/opentelemetry/dummy_client.py (100%)
 rename examples/{ => online_serving}/prometheus_grafana/README.md (100%)
 rename examples/{ => online_serving}/prometheus_grafana/docker-compose.yaml (100%)
 rename examples/{ => online_serving}/prometheus_grafana/grafana.json (100%)
 rename examples/{ => online_serving}/prometheus_grafana/prometheus.yaml (100%)
 rename examples/{ => online_serving}/run_cluster.sh (100%)
 rename examples/{ => online_serving}/sagemaker-entrypoint.sh (100%)
 rename examples/{ => other}/fp8/README.md (88%)
 rename examples/{ => other}/fp8/extract_scales.py (100%)
 rename examples/{ => other}/fp8/quantizer/README.md (100%)
 rename examples/{ => other}/fp8/quantizer/quantize.py (100%)
 rename examples/{ => other}/logging_configuration.md (100%)
 rename examples/{ => other}/tensorize_vllm_model.py (96%)

diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh
index a4eca078568fd..87d08c8c7fdcb 100644
--- a/.buildkite/run-cpu-test.sh
+++ b/.buildkite/run-cpu-test.sh
@@ -30,7 +30,7 @@ function cpu_tests() {
   # offline inference
   docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" bash -c "
     set -e
-    python3 examples/offline_inference.py"
+    python3 examples/offline_inference/offline_inference.py"
 
   # Run basic model test
   docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
diff --git a/.buildkite/run-gh200-test.sh b/.buildkite/run-gh200-test.sh
index 4fc6d089cc666..1e5ff77895a38 100644
--- a/.buildkite/run-gh200-test.sh
+++ b/.buildkite/run-gh200-test.sh
@@ -24,5 +24,5 @@ remove_docker_container
 
 # Run the image and test offline inference
 docker run --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
-    python3 examples/offline_inference.py
+    python3 examples/offline_inference/offline_inference.py
 '
diff --git a/.buildkite/run-hpu-test.sh b/.buildkite/run-hpu-test.sh
index fa4f74fca7a11..a50570ab53438 100644
--- a/.buildkite/run-hpu-test.sh
+++ b/.buildkite/run-hpu-test.sh
@@ -13,4 +13,4 @@ trap remove_docker_container EXIT
 remove_docker_container
 
 # Run the image and launch offline inference
-docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference.py
\ No newline at end of file
+docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/offline_inference.py
\ No newline at end of file
diff --git a/.buildkite/run-neuron-test.sh b/.buildkite/run-neuron-test.sh
index aa29c434e7cfb..52d485939b1d0 100644
--- a/.buildkite/run-neuron-test.sh
+++ b/.buildkite/run-neuron-test.sh
@@ -51,4 +51,4 @@ docker run --rm -it --device=/dev/neuron0 --device=/dev/neuron1 --network host \
        -e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \
        --name "${container_name}" \
        ${image_name} \
-       /bin/bash -c "python3 /workspace/vllm/examples/offline_inference_neuron.py"
+       /bin/bash -c "python3 /workspace/vllm/examples/offline_inference/offline_inference_neuron.py"
diff --git a/.buildkite/run-openvino-test.sh b/.buildkite/run-openvino-test.sh
index 6b12f424fd828..380f7a44a429a 100755
--- a/.buildkite/run-openvino-test.sh
+++ b/.buildkite/run-openvino-test.sh
@@ -13,4 +13,4 @@ trap remove_docker_container EXIT
 remove_docker_container
 
 # Run the image and launch offline inference
-docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference.py
+docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference/offline_inference.py
diff --git a/.buildkite/run-tpu-test.sh b/.buildkite/run-tpu-test.sh
index 770dad6ffa3a1..13605a3e97142 100644
--- a/.buildkite/run-tpu-test.sh
+++ b/.buildkite/run-tpu-test.sh
@@ -14,4 +14,4 @@ remove_docker_container
 # For HF_TOKEN.
 source /etc/environment
 # Run a simple end-to-end example.
-docker run --privileged --net host --shm-size=16G -it -e "HF_TOKEN=$HF_TOKEN" --name tpu-test vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git && python3 -m pip install pytest && python3 -m pip install lm_eval[api]==0.4.4 && pytest -v -s /workspace/vllm/tests/entrypoints/openai/test_accuracy.py && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py && python3 /workspace/vllm/tests/tpu/test_compilation.py && python3 /workspace/vllm/examples/offline_inference_tpu.py"
+docker run --privileged --net host --shm-size=16G -it -e "HF_TOKEN=$HF_TOKEN" --name tpu-test vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git && python3 -m pip install pytest && python3 -m pip install lm_eval[api]==0.4.4 && pytest -v -s /workspace/vllm/tests/entrypoints/openai/test_accuracy.py && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py && python3 /workspace/vllm/tests/tpu/test_compilation.py && python3 /workspace/vllm/examples/offline_inference/offline_inference_tpu.py"
diff --git a/.buildkite/run-xpu-test.sh b/.buildkite/run-xpu-test.sh
index e0a12afbe7320..160e10aa3bb9b 100644
--- a/.buildkite/run-xpu-test.sh
+++ b/.buildkite/run-xpu-test.sh
@@ -14,6 +14,6 @@ remove_docker_container
 
 # Run the image and test offline inference/tensor parallel
 docker run --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path --entrypoint="" xpu-test sh -c '
-    python3 examples/offline_inference.py
-    python3 examples/offline_inference_cli.py -tp 2
+    python3 examples/offline_inference/offline_inference.py
+    python3 examples/offline_inference/offline_inference_cli.py -tp 2
 '
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index dcfe228ce8eae..b7178b94f481a 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -187,19 +187,19 @@ steps:
   - examples/
   commands:
     - pip install tensorizer # for tensorizer test
-    - python3 offline_inference.py
-    - python3 cpu_offload.py
-    - python3 offline_inference_chat.py
-    - python3 offline_inference_with_prefix.py
-    - python3 llm_engine_example.py
-    - python3 offline_inference_vision_language.py
-    - python3 offline_inference_vision_language_multi_image.py
-    - python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
-    - python3 offline_inference_encoder_decoder.py
-    - python3 offline_inference_classification.py
-    - python3 offline_inference_embedding.py
-    - python3 offline_inference_scoring.py
-    - python3 offline_profile.py --model facebook/opt-125m run_num_steps --num-steps 2
+    - python3 offline_inference/offline_inference.py
+    - python3 offline_inference/cpu_offload.py
+    - python3 offline_inference/offline_inference_chat.py
+    - python3 offline_inference/offline_inference_with_prefix.py
+    - python3 offline_inference/llm_engine_example.py
+    - python3 offline_inference/offline_inference_vision_language.py
+    - python3 offline_inference/offline_inference_vision_language_multi_image.py
+    - python3 other/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 other/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
+    - python3 offline_inference/offline_inference_encoder_decoder.py
+    - python3 offline_inference/offline_inference_classification.py
+    - python3 offline_inference/offline_inference_embedding.py
+    - python3 offline_inference/offline_inference_scoring.py
+    - python3 offline_inference/offline_profile.py --model facebook/opt-125m run_num_steps --num-steps 2
 
 - label: Prefix Caching Test # 9min
   mirror_hardwares: [amd]
diff --git a/.github/workflows/lint-and-deploy.yaml b/.github/workflows/lint-and-deploy.yaml
index ab6f6e5d2060d..ee768db63c96c 100644
--- a/.github/workflows/lint-and-deploy.yaml
+++ b/.github/workflows/lint-and-deploy.yaml
@@ -27,7 +27,7 @@ jobs:
           version: v3.10.1
 
       - name: Run chart-testing (lint)
-        run: ct lint --target-branch ${{ github.event.repository.default_branch }} --chart-dirs examples/chart-helm --charts examples/chart-helm
+        run: ct lint --target-branch ${{ github.event.repository.default_branch }} --chart-dirs examples/online_serving/chart-helm --charts examples/online_serving/chart-helm
 
       - name: Setup minio
         run: |
@@ -64,7 +64,7 @@ jobs:
         run: |
           export AWS_ACCESS_KEY_ID=minioadmin
           export AWS_SECRET_ACCESS_KEY=minioadmin
-          helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/chart-helm -f examples/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env"
+          helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/online_serving/chart-helm -f examples/online_serving/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env"
     
       - name: curl test
         run: |
diff --git a/Dockerfile b/Dockerfile
index 088314eb38dbe..808cf675acf4d 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -250,7 +250,7 @@ ENV VLLM_USAGE_SOURCE production-docker-image
 # define sagemaker first, so it is not default from `docker build`
 FROM vllm-openai-base AS vllm-sagemaker
 
-COPY examples/sagemaker-entrypoint.sh .
+COPY examples/online_serving/sagemaker-entrypoint.sh .
 RUN chmod +x sagemaker-entrypoint.sh
 ENTRYPOINT ["./sagemaker-entrypoint.sh"]
 
diff --git a/docs/source/contributing/profiling/profiling_index.md b/docs/source/contributing/profiling/profiling_index.md
index 46210957c19ec..97de40ff469f1 100644
--- a/docs/source/contributing/profiling/profiling_index.md
+++ b/docs/source/contributing/profiling/profiling_index.md
@@ -26,7 +26,7 @@ Set the env variable VLLM_RPC_TIMEOUT to a big number before you start the serve
 
 ### Offline Inference
 
-Refer to <gh-file:examples/offline_inference_with_profiler.py> for an example.
+Refer to <gh-file:examples/offline_inference/offline_inference_with_profiler.py> for an example.
 
 ### OpenAI Server
 
diff --git a/docs/source/deployment/frameworks/skypilot.md b/docs/source/deployment/frameworks/skypilot.md
index f02a943026922..657e7f2bc72cc 100644
--- a/docs/source/deployment/frameworks/skypilot.md
+++ b/docs/source/deployment/frameworks/skypilot.md
@@ -61,7 +61,7 @@ run: |
 
   echo 'Starting gradio server...'
   git clone https://github.com/vllm-project/vllm.git || true
-  python vllm/examples/gradio_openai_chatbot_webserver.py \
+  python vllm/examples/online_serving/gradio_openai_chatbot_webserver.py \
     -m $MODEL_NAME \
     --port 8811 \
     --model-url http://localhost:8081/v1 \
@@ -321,7 +321,7 @@ run: |
 
   echo 'Starting gradio server...'
   git clone https://github.com/vllm-project/vllm.git || true
-  python vllm/examples/gradio_openai_chatbot_webserver.py \
+  python vllm/examples/online_serving/gradio_openai_chatbot_webserver.py \
     -m $MODEL_NAME \
     --port 8811 \
     --model-url http://$ENDPOINT/v1 \
diff --git a/docs/source/features/disagg_prefill.md b/docs/source/features/disagg_prefill.md
index 645dc60807dd3..efa2efc66192e 100644
--- a/docs/source/features/disagg_prefill.md
+++ b/docs/source/features/disagg_prefill.md
@@ -21,7 +21,7 @@ Disaggregated prefill DOES NOT improve throughput.
 
 ## Usage example
 
-Please refer to `examples/disaggregated_prefill.sh` for the example usage of disaggregated prefilling.
+Please refer to `examples/online_serving/disaggregated_prefill.sh` for an example of how to use disaggregated prefilling.
 
 ## Benchmarks
 
diff --git a/docs/source/features/lora.md b/docs/source/features/lora.md
index cf06916d70f44..b00d05147bb32 100644
--- a/docs/source/features/lora.md
+++ b/docs/source/features/lora.md
@@ -47,7 +47,7 @@ outputs = llm.generate(
 )
 ```
 
-Check out <gh-file:examples/multilora_inference.py> for an example of how to use LoRA adapters with the async engine and how to use more advanced configuration options.
+Check out <gh-file:examples/offline_inference/multilora_inference.py> for an example of how to use LoRA adapters with the async engine and how to use more advanced configuration options.
 
 ## Serving LoRA Adapters
 
diff --git a/docs/source/features/quantization/auto_awq.md b/docs/source/features/quantization/auto_awq.md
index c02fbf0605a8c..3679595e3d4d0 100644
--- a/docs/source/features/quantization/auto_awq.md
+++ b/docs/source/features/quantization/auto_awq.md
@@ -47,7 +47,7 @@ print(f'Model is quantized and saved at "{quant_path}"')
 To run an AWQ model with vLLM, you can use [TheBloke/Llama-2-7b-Chat-AWQ](https://huggingface.co/TheBloke/Llama-2-7b-Chat-AWQ) with the following command:
 
 ```console
-$ python examples/llm_engine_example.py --model TheBloke/Llama-2-7b-Chat-AWQ --quantization awq
+$ python examples/offline_inference/llm_engine_example.py --model TheBloke/Llama-2-7b-Chat-AWQ --quantization awq
 ```
 
 AWQ models are also supported directly through the LLM entrypoint:
diff --git a/docs/source/features/quantization/fp8_e4m3_kvcache.md b/docs/source/features/quantization/fp8_e4m3_kvcache.md
index f200c722d1d42..50edaf81fddd3 100644
--- a/docs/source/features/quantization/fp8_e4m3_kvcache.md
+++ b/docs/source/features/quantization/fp8_e4m3_kvcache.md
@@ -28,7 +28,7 @@ Here is an example of how to enable this feature:
 
 ```python
 # two float8_e4m3fn kv cache scaling factor files are provided under tests/fp8_kv, please refer to
-# https://github.com/vllm-project/vllm/blob/main/examples/fp8/README.md to generate kv_cache_scales.json of your own.
+# https://github.com/vllm-project/vllm/blob/main/examples/other/fp8/README.md to generate kv_cache_scales.json of your own.
 
 from vllm import LLM, SamplingParams
 sampling_params = SamplingParams(temperature=1.3, top_p=0.8)
diff --git a/docs/source/features/structured_outputs.md b/docs/source/features/structured_outputs.md
index 26c09bb0d8a0c..ccd9a6a1b1a14 100644
--- a/docs/source/features/structured_outputs.md
+++ b/docs/source/features/structured_outputs.md
@@ -131,7 +131,7 @@ completion = client.chat.completions.create(
 print(completion.choices[0].message.content)
 ```
 
-Full example: <gh-file:examples/openai_chat_completion_structured_outputs.py>
+Full example: <gh-file:examples/online_serving/openai_chat_completion_structured_outputs.py>
 
 ## Experimental Automatic Parsing (OpenAI API)
 
@@ -257,4 +257,4 @@ outputs = llm.generate(
 print(outputs[0].outputs[0].text)
 ```
 
-Full example: <gh-file:examples/offline_inference_structured_outputs.py>
+Full example: <gh-file:examples/offline_inference/offline_inference_structured_outputs.py>
diff --git a/docs/source/generate_examples.py b/docs/source/generate_examples.py
index 32bb86c469c78..aaa13d0fb6d3f 100644
--- a/docs/source/generate_examples.py
+++ b/docs/source/generate_examples.py
@@ -12,6 +12,7 @@
 def fix_case(text: str) -> str:
     subs = {
         "api": "API",
+        "Cli": "CLI",
         "cpu": "CPU",
         "llm": "LLM",
         "tpu": "TPU",
@@ -58,7 +59,7 @@ def generate(self) -> str:
         content = f"# {self.title}\n\n{self.description}\n\n"
         content += "```{toctree}\n"
         content += f":caption: {self.caption}\n:maxdepth: {self.maxdepth}\n"
-        content += "\n".join(sorted(self.documents)) + "\n```\n"
+        content += "\n".join(self.documents) + "\n```\n"
         return content
 
 
@@ -131,11 +132,14 @@ def generate(self) -> str:
             ROOT_DIR)
 
         content = f"Source <gh-file:{self.path.relative_to(ROOT_DIR)}>.\n\n"
-        if self.main_file.suffix == ".py":
-            content += f"# {self.title}\n\n"
         include = "include" if self.main_file.suffix == ".md" else \
             "literalinclude"
-        content += f":::{{{include}}} {make_relative(self.main_file)}\n:::\n\n"
+        if include == "literalinclude":
+            content += f"# {self.title}\n\n"
+        content += f":::{{{include}}} {make_relative(self.main_file)}\n"
+        if include == "literalinclude":
+            content += f":language: {self.main_file.suffix[1:]}\n"
+        content += ":::\n\n"
 
         if not self.other_files:
             return content
@@ -163,14 +167,16 @@ def generate_examples():
         description=
         "A collection of examples demonstrating usage of vLLM.\nAll documented examples are autogenerated using <gh-file:docs/source/generate_examples.py> from examples found in <gh-file:examples>.",  # noqa: E501
         caption="Examples",
-        maxdepth=1)  # TODO change to 2 when examples start being categorised
+        maxdepth=2)
+    # Category indices are stored in reverse order because each one is inserted
+    # at index 0 of examples_index.documents (the last one ends up first).
     category_indices = {
-        "offline_inference":
+        "other":
         Index(
-            path=EXAMPLE_DOC_DIR / "examples_offline_inference_index.md",
-            title="Offline Inference",
+            path=EXAMPLE_DOC_DIR / "examples_other_index.md",
+            title="Other",
             description=
-            "Offline inference examples demonstrate how to use vLLM in an offline setting, where the model is queried for predictions in batches.",  # noqa: E501
+            "Other examples that don't strongly fit into the online or offline serving categories.",  # noqa: E501
             caption="Examples",
         ),
         "online_serving":
@@ -181,31 +187,30 @@ def generate_examples():
             "Online serving examples demonstrate how to use vLLM in an online setting, where the model is queried for predictions in real-time.",  # noqa: E501
             caption="Examples",
         ),
-        "other":
+        "offline_inference":
         Index(
-            path=EXAMPLE_DOC_DIR / "examples_other_index.md",
-            title="Other",
+            path=EXAMPLE_DOC_DIR / "examples_offline_inference_index.md",
+            title="Offline Inference",
             description=
-            "Other examples that don't strongly fit into the online or offline serving categories.",  # noqa: E501
+            "Offline inference examples demonstrate how to use vLLM in an offline setting, where the model is queried for predictions in batches.",  # noqa: E501
             caption="Examples",
         ),
     }
 
     examples = []
+    glob_patterns = ["*.py", "*.md", "*.sh"]
     # Find categorised examples
     for category in category_indices:
         category_dir = EXAMPLE_DIR / category
-        py = category_dir.glob("*.py")
-        md = category_dir.glob("*.md")
-        for path in itertools.chain(py, md):
+        globs = [category_dir.glob(pattern) for pattern in glob_patterns]
+        for path in itertools.chain(*globs):
             examples.append(Example(path, category))
         # Find examples in subdirectories
         for path in category_dir.glob("*/*.md"):
             examples.append(Example(path.parent, category))
     # Find uncategorised examples
-    py = EXAMPLE_DIR.glob("*.py")
-    md = EXAMPLE_DIR.glob("*.md")
-    for path in itertools.chain(py, md):
+    globs = [EXAMPLE_DIR.glob(pattern) for pattern in glob_patterns]
+    for path in itertools.chain(*globs):
         examples.append(Example(path))
     # Find examples in subdirectories
     for path in EXAMPLE_DIR.glob("*/*.md"):
@@ -215,7 +220,7 @@ def generate_examples():
         examples.append(Example(path.parent))
 
     # Generate the example documentation
-    for example in examples:
+    for example in sorted(examples, key=lambda e: e.path.stem):
         doc_path = EXAMPLE_DOC_DIR / f"{example.path.stem}.md"
         with open(doc_path, "w+") as f:
             f.write(example.generate())
diff --git a/docs/source/getting_started/installation/cpu-x86.md b/docs/source/getting_started/installation/cpu-x86.md
index bbb2d1872ef39..bb046dd0fd9dc 100644
--- a/docs/source/getting_started/installation/cpu-x86.md
+++ b/docs/source/getting_started/installation/cpu-x86.md
@@ -95,7 +95,7 @@ $ VLLM_TARGET_DEVICE=cpu python setup.py install
 $ sudo apt-get install libtcmalloc-minimal4 # install TCMalloc library
 $ find / -name *libtcmalloc* # find the dynamic link library path
 $ export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD # prepend the library to LD_PRELOAD
-$ python examples/offline_inference.py # run vLLM
+$ python examples/offline_inference/offline_inference.py # run vLLM
 ```
 
 - When using the online serving, it is recommended to reserve 1-2 CPU cores for the serving framework to avoid CPU oversubscription. For example, on a platform with 32 physical CPU cores, reserving CPU 30 and 31 for the framework and using CPU 0-29 for OpenMP:
@@ -132,7 +132,7 @@ CPU NODE SOCKET CORE L1d:L1i:L2:L3 ONLINE    MAXMHZ   MINMHZ      MHZ
 
 # On this platform, it is recommend to only bind openMP threads on logical CPU cores 0-7 or 8-15
 $ export VLLM_CPU_OMP_THREADS_BIND=0-7
-$ python examples/offline_inference.py
+$ python examples/offline_inference/offline_inference.py
 ```
 
 - If using vLLM CPU backend on a multi-socket machine with NUMA, be aware to set CPU cores using `VLLM_CPU_OMP_THREADS_BIND` to avoid cross NUMA node memory access.
diff --git a/docs/source/getting_started/installation/xpu.md b/docs/source/getting_started/installation/xpu.md
index be4e3b9bd1bc5..c1ab5478eb652 100644
--- a/docs/source/getting_started/installation/xpu.md
+++ b/docs/source/getting_started/installation/xpu.md
@@ -71,4 +71,4 @@ $      --pipeline-parallel-size=2 \
 $      -tp=8
 ```
 
-By default, a ray instance will be launched automatically if no existing one is detected in system, with `num-gpus` equals to `parallel_config.world_size`. We recommend properly starting a ray cluster before execution, referring to the <gh-file:examples/run_cluster.sh> helper script.
+By default, a ray instance will be launched automatically if no existing one is detected in the system, with `num-gpus` equal to `parallel_config.world_size`. We recommend properly starting a ray cluster before execution, referring to the <gh-file:examples/online_serving/run_cluster.sh> helper script.
diff --git a/docs/source/getting_started/quickstart.md b/docs/source/getting_started/quickstart.md
index 3f9556165ece4..6b56918ce5638 100644
--- a/docs/source/getting_started/quickstart.md
+++ b/docs/source/getting_started/quickstart.md
@@ -31,7 +31,7 @@ For non-CUDA platforms, please refer [here](#installation-index) for specific in
 
 ## Offline Batched Inference
 
-With vLLM installed, you can start generating texts for list of input prompts (i.e. offline batch inferencing). See the example script: <gh-file:examples/offline_inference.py>
+With vLLM installed, you can start generating texts for list of input prompts (i.e. offline batch inferencing). See the example script: <gh-file:examples/offline_inference/offline_inference.py>
 
 The first line of this example imports the classes {class}`~vllm.LLM` and {class}`~vllm.SamplingParams`:
 
@@ -133,7 +133,7 @@ completion = client.completions.create(model="Qwen/Qwen2.5-1.5B-Instruct",
 print("Completion result:", completion)
 ```
 
-A more detailed client example can be found here: <gh-file:examples/openai_completion_client.py>
+A more detailed client example can be found here: <gh-file:examples/online_serving/openai_completion_client.py>
 
 ### OpenAI Chat Completions API with vLLM
 
diff --git a/docs/source/getting_started/troubleshooting.md b/docs/source/getting_started/troubleshooting.md
index 5a0310da0f2cb..f5efe0bef7506 100644
--- a/docs/source/getting_started/troubleshooting.md
+++ b/docs/source/getting_started/troubleshooting.md
@@ -24,7 +24,7 @@ To isolate the model downloading and loading issue, you can use the `--load-form
 
 ## Model is too large
 
-If the model is too large to fit in a single GPU, you might want to [consider tensor parallelism](#distributed-serving) to split the model across multiple GPUs. In that case, every process will read the whole model and split it into chunks, which makes the disk reading time even longer (proportional to the size of tensor parallelism). You can convert the model checkpoint to a sharded checkpoint using <gh-file:examples/save_sharded_state.py>. The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism.
+If the model is too large to fit in a single GPU, you might want to [consider tensor parallelism](#distributed-serving) to split the model across multiple GPUs. In that case, every process will read the whole model and split it into chunks, which makes the disk reading time even longer (proportional to the size of tensor parallelism). You can convert the model checkpoint to a sharded checkpoint using <gh-file:examples/offline_inference/save_sharded_state.py>. The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism.
 
 ## Enable more logging
 
diff --git a/docs/source/models/extensions/tensorizer.md b/docs/source/models/extensions/tensorizer.md
index 42ed5c795dd27..ae17e3437bca6 100644
--- a/docs/source/models/extensions/tensorizer.md
+++ b/docs/source/models/extensions/tensorizer.md
@@ -9,7 +9,7 @@ shorter Pod startup times and CPU memory usage. Tensor encryption is also suppor
 
 For more information on CoreWeave's Tensorizer, please refer to
 [CoreWeave's Tensorizer documentation](https://github.com/coreweave/tensorizer). For more information on serializing a vLLM model, as well a general usage guide to using Tensorizer with vLLM, see
-the [vLLM example script](https://docs.vllm.ai/en/stable/getting_started/examples/tensorize_vllm_model.html).
+the [vLLM example script](https://docs.vllm.ai/en/stable/getting_started/examples/offline_inference/tensorize_vllm_model.html).
 
 ```{note}
 Note that to use this feature you will need to install `tensorizer` by running `pip install vllm[tensorizer]`.
diff --git a/docs/source/models/generative_models.md b/docs/source/models/generative_models.md
index 383299d61b5dd..6228c7c2ac957 100644
--- a/docs/source/models/generative_models.md
+++ b/docs/source/models/generative_models.md
@@ -46,7 +46,7 @@ for output in outputs:
     print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
 ```
 
-A code example can be found here: <gh-file:examples/offline_inference.py>
+A code example can be found here: <gh-file:examples/offline_inference/offline_inference.py>
 
 ### `LLM.beam_search`
 
@@ -103,7 +103,7 @@ for output in outputs:
     print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
 ```
 
-A code example can be found here: <gh-file:examples/offline_inference_chat.py>
+A code example can be found here: <gh-file:examples/offline_inference/offline_inference_chat.py>
 
 If the model doesn't have a chat template or you want to specify another one,
 you can explicitly pass a chat template:
diff --git a/docs/source/models/pooling_models.md b/docs/source/models/pooling_models.md
index 12ded68eb30b5..3e4407cfdc233 100644
--- a/docs/source/models/pooling_models.md
+++ b/docs/source/models/pooling_models.md
@@ -65,7 +65,7 @@ embeds = output.outputs.embedding
 print(f"Embeddings: {embeds!r} (size={len(embeds)})")
 ```
 
-A code example can be found here: <gh-file:examples/offline_inference_embedding.py>
+A code example can be found here: <gh-file:examples/offline_inference/offline_inference_embedding.py>
 
 ### `LLM.classify`
 
@@ -80,7 +80,7 @@ probs = output.outputs.probs
 print(f"Class Probabilities: {probs!r} (size={len(probs)})")
 ```
 
-A code example can be found here: <gh-file:examples/offline_inference_classification.py>
+A code example can be found here: <gh-file:examples/offline_inference/offline_inference_classification.py>
 
 ### `LLM.score`
 
@@ -102,7 +102,7 @@ score = output.outputs.score
 print(f"Score: {score}")
 ```
 
-A code example can be found here: <gh-file:examples/offline_inference_scoring.py>
+A code example can be found here: <gh-file:examples/offline_inference/offline_inference_scoring.py>
 
 ## Online Inference
 
diff --git a/docs/source/serving/distributed_serving.md b/docs/source/serving/distributed_serving.md
index b1703249d7224..4e0a9ef6ecf7d 100644
--- a/docs/source/serving/distributed_serving.md
+++ b/docs/source/serving/distributed_serving.md
@@ -51,7 +51,7 @@ $     --pipeline-parallel-size 2
 
 If a single node does not have enough GPUs to hold the model, you can run the model using multiple nodes. It is important to make sure the execution environment is the same on all nodes, including the model path, the Python environment. The recommended way is to use docker images to ensure the same environment, and hide the heterogeneity of the host machines via mapping them into the same docker configuration.
 
-The first step, is to start containers and organize them into a cluster. We have provided the helper script <gh-file:examples/run_cluster.sh> to start the cluster. Please note, this script launches docker without administrative privileges that would be required to access GPU performance counters when running profiling and tracing tools. For that purpose, the script can have `CAP_SYS_ADMIN` to the docker container by using the `--cap-add` option in the docker run command.
+The first step is to start containers and organize them into a cluster. We have provided the helper script <gh-file:examples/online_serving/run_cluster.sh> to start the cluster. Please note, this script launches docker without administrative privileges that would be required to access GPU performance counters when running profiling and tracing tools. For that purpose, the script can add `CAP_SYS_ADMIN` to the docker container by using the `--cap-add` option in the docker run command.
 
 Pick a node as the head node, and run the following command:
 
diff --git a/docs/source/serving/multimodal_inputs.md b/docs/source/serving/multimodal_inputs.md
index 0efa09f2869ca..9f5e1b908d786 100644
--- a/docs/source/serving/multimodal_inputs.md
+++ b/docs/source/serving/multimodal_inputs.md
@@ -60,7 +60,7 @@ for o in outputs:
     print(generated_text)
 ```
 
-Full example: <gh-file:examples/offline_inference_vision_language.py>
+Full example: <gh-file:examples/offline_inference/offline_inference_vision_language.py>
 
 To substitute multiple images inside the same text prompt, you can pass in a list of images instead:
 
@@ -91,7 +91,7 @@ for o in outputs:
     print(generated_text)
 ```
 
-Full example: <gh-file:examples/offline_inference_vision_language_multi_image.py>
+Full example: <gh-file:examples/offline_inference/offline_inference_vision_language_multi_image.py>
 
 Multi-image input can be extended to perform video captioning. We show this with [Qwen2-VL](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct) as it supports videos:
 
@@ -125,13 +125,13 @@ for o in outputs:
 You can pass a list of NumPy arrays directly to the `'video'` field of the multi-modal dictionary
 instead of using multi-image input.
 
-Full example: <gh-file:examples/offline_inference_vision_language.py>
+Full example: <gh-file:examples/offline_inference/offline_inference_vision_language.py>
 
 ### Audio
 
 You can pass a tuple `(array, sampling_rate)` to the `'audio'` field of the multi-modal dictionary.
 
-Full example: <gh-file:examples/offline_inference_audio_language.py>
+Full example: <gh-file:examples/offline_inference/offline_inference_audio_language.py>
 
 ### Embedding
 
@@ -271,7 +271,7 @@ chat_response = client.chat.completions.create(
 print("Chat completion output:", chat_response.choices[0].message.content)
 ```
 
-Full example: <gh-file:examples/openai_chat_completion_client_for_multimodal.py>
+Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for_multimodal.py>
 
 ```{tip}
 Loading from local file paths is also supported on vLLM: You can specify the allowed local media path via `--allowed-local-media-path` when launching the API server/engine,
@@ -342,7 +342,7 @@ result = chat_completion_from_url.choices[0].message.content
 print("Chat completion output from image url:", result)
 ```
 
-Full example: <gh-file:examples/openai_chat_completion_client_for_multimodal.py>
+Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for_multimodal.py>
 
 ````{note}
 By default, the timeout for fetching videos through HTTP URL is `30` seconds.
@@ -445,7 +445,7 @@ result = chat_completion_from_url.choices[0].message.content
 print("Chat completion output from audio url:", result)
 ```
 
-Full example: <gh-file:examples/openai_chat_completion_client_for_multimodal.py>
+Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for_multimodal.py>
 
 ````{note}
 By default, the timeout for fetching audios through HTTP URL is `10` seconds.
@@ -529,4 +529,4 @@ Also important, `MrLight/dse-qwen2-2b-mrl-v1` requires a placeholder image of th
 example below for details.
 ```
 
-Full example: <gh-file:examples/openai_chat_embedding_client_for_multimodal.py>
+Full example: <gh-file:examples/online_serving/openai_chat_embedding_client_for_multimodal.py>
diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md
index 1e5ea6357d202..022dd3ae8a237 100644
--- a/docs/source/serving/openai_compatible_server.md
+++ b/docs/source/serving/openai_compatible_server.md
@@ -191,7 +191,7 @@ The order of priorities is `command line > config file values > defaults`.
 Our Completions API is compatible with [OpenAI's Completions API](https://platform.openai.com/docs/api-reference/completions);
 you can use the [official OpenAI Python client](https://github.com/openai/openai-python) to interact with it.
 
-Code example: <gh-file:examples/openai_completion_client.py>
+Code example: <gh-file:examples/online_serving/openai_completion_client.py>
 
 #### Extra parameters
 
@@ -222,7 +222,7 @@ We support both [Vision](https://platform.openai.com/docs/guides/vision)- and
 see our [Multimodal Inputs](#multimodal-inputs) guide for more information.
 - *Note: `image_url.detail` parameter is not supported.*
 
-Code example: <gh-file:examples/openai_chat_completion_client.py>
+Code example: <gh-file:examples/online_serving/openai_chat_completion_client.py>
 
 #### Extra parameters
 
@@ -255,7 +255,7 @@ which will be treated as a single prompt to the model.
 This enables multi-modal inputs to be passed to embedding models, see [this page](#multimodal-inputs) for details.
 ```
 
-Code example: <gh-file:examples/openai_embedding_client.py>
+Code example: <gh-file:examples/online_serving/openai_embedding_client.py>
 
 #### Extra parameters
 
@@ -299,7 +299,7 @@ Our Pooling API encodes input prompts using a [pooling model](../models/pooling_
 
 The input format is the same as [Embeddings API](#embeddings-api), but the output data can contain an arbitrary nested list, not just a 1-D list of floats.
 
-Code example: <gh-file:examples/openai_pooling_client.py>
+Code example: <gh-file:examples/online_serving/openai_pooling_client.py>
 
 (score-api)=
 ### Score API
@@ -309,7 +309,7 @@ Usually, the score for a sentence pair refers to the similarity between two sent
 
 You can find the documentation for these kind of models at [sbert.net](https://www.sbert.net/docs/package_reference/cross_encoder/cross_encoder.html).
 
-Code example: <gh-file:examples/openai_cross_encoder_score.py>
+Code example: <gh-file:examples/online_serving/openai_cross_encoder_score.py>
 
 #### Single inference
 
diff --git a/examples/aqlm_example.py b/examples/offline_inference/aqlm_example.py
similarity index 100%
rename from examples/aqlm_example.py
rename to examples/offline_inference/aqlm_example.py
diff --git a/examples/cpu_offload.py b/examples/offline_inference/cpu_offload.py
similarity index 100%
rename from examples/cpu_offload.py
rename to examples/offline_inference/cpu_offload.py
diff --git a/examples/florence2_inference.py b/examples/offline_inference/florence2_inference.py
similarity index 92%
rename from examples/florence2_inference.py
rename to examples/offline_inference/florence2_inference.py
index b58ac2e1f7ed4..49dd2c331db5a 100644
--- a/examples/florence2_inference.py
+++ b/examples/offline_inference/florence2_inference.py
@@ -3,7 +3,8 @@
 encoder/decoder models, specifically Florence-2
 '''
 # TODO(Isotr0py):
-# Move to offline_inference_vision_language.py after porting vision backbone
+# Move to offline_inference/offline_inference_vision_language.py
+# after porting vision backbone
 from vllm import LLM, SamplingParams
 
 dtype = "float"
diff --git a/examples/gguf_inference.py b/examples/offline_inference/gguf_inference.py
similarity index 100%
rename from examples/gguf_inference.py
rename to examples/offline_inference/gguf_inference.py
diff --git a/examples/llm_engine_example.py b/examples/offline_inference/llm_engine_example.py
similarity index 100%
rename from examples/llm_engine_example.py
rename to examples/offline_inference/llm_engine_example.py
diff --git a/examples/lora_with_quantization_inference.py b/examples/offline_inference/lora_with_quantization_inference.py
similarity index 100%
rename from examples/lora_with_quantization_inference.py
rename to examples/offline_inference/lora_with_quantization_inference.py
diff --git a/examples/multilora_inference.py b/examples/offline_inference/multilora_inference.py
similarity index 100%
rename from examples/multilora_inference.py
rename to examples/offline_inference/multilora_inference.py
diff --git a/examples/offline_chat_with_tools.py b/examples/offline_inference/offline_chat_with_tools.py
similarity index 100%
rename from examples/offline_chat_with_tools.py
rename to examples/offline_inference/offline_chat_with_tools.py
diff --git a/examples/offline_inference.py b/examples/offline_inference/offline_inference.py
similarity index 100%
rename from examples/offline_inference.py
rename to examples/offline_inference/offline_inference.py
diff --git a/examples/offline_inference_arctic.py b/examples/offline_inference/offline_inference_arctic.py
similarity index 100%
rename from examples/offline_inference_arctic.py
rename to examples/offline_inference/offline_inference_arctic.py
diff --git a/examples/offline_inference_audio_language.py b/examples/offline_inference/offline_inference_audio_language.py
similarity index 100%
rename from examples/offline_inference_audio_language.py
rename to examples/offline_inference/offline_inference_audio_language.py
diff --git a/examples/offline_inference_chat.py b/examples/offline_inference/offline_inference_chat.py
similarity index 100%
rename from examples/offline_inference_chat.py
rename to examples/offline_inference/offline_inference_chat.py
diff --git a/examples/offline_inference_classification.py b/examples/offline_inference/offline_inference_classification.py
similarity index 100%
rename from examples/offline_inference_classification.py
rename to examples/offline_inference/offline_inference_classification.py
diff --git a/examples/offline_inference_cli.py b/examples/offline_inference/offline_inference_cli.py
similarity index 100%
rename from examples/offline_inference_cli.py
rename to examples/offline_inference/offline_inference_cli.py
diff --git a/examples/offline_inference_distributed.py b/examples/offline_inference/offline_inference_distributed.py
similarity index 100%
rename from examples/offline_inference_distributed.py
rename to examples/offline_inference/offline_inference_distributed.py
diff --git a/examples/offline_inference_embedding.py b/examples/offline_inference/offline_inference_embedding.py
similarity index 100%
rename from examples/offline_inference_embedding.py
rename to examples/offline_inference/offline_inference_embedding.py
diff --git a/examples/offline_inference_encoder_decoder.py b/examples/offline_inference/offline_inference_encoder_decoder.py
similarity index 100%
rename from examples/offline_inference_encoder_decoder.py
rename to examples/offline_inference/offline_inference_encoder_decoder.py
diff --git a/examples/offline_inference_mlpspeculator.py b/examples/offline_inference/offline_inference_mlpspeculator.py
similarity index 100%
rename from examples/offline_inference_mlpspeculator.py
rename to examples/offline_inference/offline_inference_mlpspeculator.py
diff --git a/examples/offline_inference_neuron.py b/examples/offline_inference/offline_inference_neuron.py
similarity index 100%
rename from examples/offline_inference_neuron.py
rename to examples/offline_inference/offline_inference_neuron.py
diff --git a/examples/offline_inference_neuron_int8_quantization.py b/examples/offline_inference/offline_inference_neuron_int8_quantization.py
similarity index 100%
rename from examples/offline_inference_neuron_int8_quantization.py
rename to examples/offline_inference/offline_inference_neuron_int8_quantization.py
diff --git a/examples/offline_inference_openai.md b/examples/offline_inference/offline_inference_openai/offline_inference_openai.md
similarity index 90%
rename from examples/offline_inference_openai.md
rename to examples/offline_inference/offline_inference_openai/offline_inference_openai.md
index 2436417cb543a..6278a1943fe4a 100644
--- a/examples/offline_inference_openai.md
+++ b/examples/offline_inference/offline_inference_openai/offline_inference_openai.md
@@ -8,7 +8,7 @@ This is a guide to performing batch inference using the OpenAI batch file format
  
 The OpenAI batch file format consists of a series of json objects on new lines.
  
-[See here for an example file.](https://github.com/vllm-project/vllm/blob/main/examples/openai_example_batch.jsonl)
+[See here for an example file.](https://github.com/vllm-project/vllm/blob/main/examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl)
  
 Each line represents a separate request. See the [OpenAI package reference](https://platform.openai.com/docs/api-reference/batch/requestInput) for more details.
  
@@ -31,13 +31,13 @@ We currently only support `/v1/chat/completions` and `/v1/embeddings` endpoints
 To follow along with this example, you can download the example batch, or create your own batch file in your working directory.
 
 ```
-wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/openai_example_batch.jsonl
+wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl
 ```
 
 Once you've created your batch file it should look like this
 
 ```
-$ cat openai_example_batch.jsonl
+$ cat offline_inference/offline_inference_openai/openai_example_batch.jsonl
 {"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
 {"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
 ```
@@ -49,7 +49,7 @@ The batch running tool is designed to be used from the command line.
 You can run the batch with the following command, which will write its results to a file called `results.jsonl`
 
 ```
-python -m vllm.entrypoints.openai.run_batch -i openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct
+python -m vllm.entrypoints.openai.run_batch -i offline_inference/offline_inference_openai/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct
 ```
 
 ### Step 3: Check your results
@@ -66,10 +66,10 @@ $ cat results.jsonl
 
 The batch runner supports remote input and output urls that are accessible via http/https.
 
-For example, to run against our example input file located at `https://raw.githubusercontent.com/vllm-project/vllm/main/examples/openai_example_batch.jsonl`, you can run
+For example, to run against our example input file located at `https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl`, you can run
 
 ```
-python -m vllm.entrypoints.openai.run_batch -i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct
+python -m vllm.entrypoints.openai.run_batch -i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct
 ```
 
 ## Example 3: Integrating with AWS S3
@@ -90,13 +90,13 @@ To integrate with cloud blob storage, we recommend using presigned urls.
 To follow along with this example, you can download the example batch, or create your own batch file in your working directory.
 
 ```
-wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/openai_example_batch.jsonl
+wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl
 ```
 
 Once you've created your batch file it should look like this
 
 ```
-$ cat openai_example_batch.jsonl
+$ cat offline_inference/offline_inference_openai/openai_example_batch.jsonl
 {"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
 {"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
 ```
@@ -104,7 +104,7 @@ $ cat openai_example_batch.jsonl
 Now upload your batch file to your S3 bucket.
 
 ```
-aws s3 cp openai_example_batch.jsonl s3://MY_BUCKET/MY_INPUT_FILE.jsonl
+aws s3 cp offline_inference/offline_inference_openai/openai_example_batch.jsonl s3://MY_BUCKET/MY_INPUT_FILE.jsonl
 ```
 
 ### Step 2: Generate your presigned urls
diff --git a/examples/openai_example_batch.jsonl b/examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl
similarity index 100%
rename from examples/openai_example_batch.jsonl
rename to examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl
diff --git a/examples/offline_inference_pixtral.py b/examples/offline_inference/offline_inference_pixtral.py
similarity index 100%
rename from examples/offline_inference_pixtral.py
rename to examples/offline_inference/offline_inference_pixtral.py
diff --git a/examples/offline_inference_scoring.py b/examples/offline_inference/offline_inference_scoring.py
similarity index 100%
rename from examples/offline_inference_scoring.py
rename to examples/offline_inference/offline_inference_scoring.py
diff --git a/examples/offline_inference_structured_outputs.py b/examples/offline_inference/offline_inference_structured_outputs.py
similarity index 100%
rename from examples/offline_inference_structured_outputs.py
rename to examples/offline_inference/offline_inference_structured_outputs.py
diff --git a/examples/offline_inference_tpu.py b/examples/offline_inference/offline_inference_tpu.py
similarity index 100%
rename from examples/offline_inference_tpu.py
rename to examples/offline_inference/offline_inference_tpu.py
diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference/offline_inference_vision_language.py
similarity index 100%
rename from examples/offline_inference_vision_language.py
rename to examples/offline_inference/offline_inference_vision_language.py
diff --git a/examples/offline_inference_vision_language_embedding.py b/examples/offline_inference/offline_inference_vision_language_embedding.py
similarity index 100%
rename from examples/offline_inference_vision_language_embedding.py
rename to examples/offline_inference/offline_inference_vision_language_embedding.py
diff --git a/examples/offline_inference_vision_language_multi_image.py b/examples/offline_inference/offline_inference_vision_language_multi_image.py
similarity index 100%
rename from examples/offline_inference_vision_language_multi_image.py
rename to examples/offline_inference/offline_inference_vision_language_multi_image.py
diff --git a/examples/offline_inference_whisper.py b/examples/offline_inference/offline_inference_whisper.py
similarity index 100%
rename from examples/offline_inference_whisper.py
rename to examples/offline_inference/offline_inference_whisper.py
diff --git a/examples/offline_inference_with_default_generation_config.py b/examples/offline_inference/offline_inference_with_default_generation_config.py
similarity index 100%
rename from examples/offline_inference_with_default_generation_config.py
rename to examples/offline_inference/offline_inference_with_default_generation_config.py
diff --git a/examples/offline_inference_with_prefix.py b/examples/offline_inference/offline_inference_with_prefix.py
similarity index 100%
rename from examples/offline_inference_with_prefix.py
rename to examples/offline_inference/offline_inference_with_prefix.py
diff --git a/examples/offline_inference_with_profiler.py b/examples/offline_inference/offline_inference_with_profiler.py
similarity index 100%
rename from examples/offline_inference_with_profiler.py
rename to examples/offline_inference/offline_inference_with_profiler.py
diff --git a/examples/offline_profile.py b/examples/offline_inference/offline_profile.py
similarity index 99%
rename from examples/offline_profile.py
rename to examples/offline_inference/offline_profile.py
index 46afe8aa2604b..187a05e4d70a2 100644
--- a/examples/offline_profile.py
+++ b/examples/offline_inference/offline_profile.py
@@ -363,7 +363,7 @@ def abort_requests():
 
     example:
     ```
-    python examples/offline_profile.py \\
+    python examples/offline_inference/offline_profile.py \\
         --model neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8 --batch-size 4 \\
         --prompt-len 512 --max-num-batched-tokens 8196 --json Llama31-8b-FP8 \\
         --enforce-eager run_num_steps -n 2
diff --git a/examples/save_sharded_state.py b/examples/offline_inference/save_sharded_state.py
similarity index 100%
rename from examples/save_sharded_state.py
rename to examples/offline_inference/save_sharded_state.py
diff --git a/examples/api_client.py b/examples/online_serving/api_client.py
similarity index 100%
rename from examples/api_client.py
rename to examples/online_serving/api_client.py
diff --git a/examples/chart-helm/.helmignore b/examples/online_serving/chart-helm/.helmignore
similarity index 100%
rename from examples/chart-helm/.helmignore
rename to examples/online_serving/chart-helm/.helmignore
diff --git a/examples/chart-helm/Chart.yaml b/examples/online_serving/chart-helm/Chart.yaml
similarity index 100%
rename from examples/chart-helm/Chart.yaml
rename to examples/online_serving/chart-helm/Chart.yaml
diff --git a/examples/online_serving/chart-helm/README.md b/examples/online_serving/chart-helm/README.md
new file mode 100644
index 0000000000000..6aa126d4fd22c
--- /dev/null
+++ b/examples/online_serving/chart-helm/README.md
@@ -0,0 +1,21 @@
+# Helm Charts
+
+This directory contains a Helm chart for deploying the vllm application. The chart includes configurations for deployment, autoscaling, resource management, and more.
+
+## Files
+
+- Chart.yaml: Defines the chart metadata including name, version, and maintainers.
+- ct.yaml: Configuration for chart testing.
+- lintconf.yaml: Linting rules for YAML files.
+- values.schema.json: JSON schema for validating values.yaml.
+- values.yaml: Default values for the Helm chart.
+- templates/_helpers.tpl: Helper templates for defining common configurations.
+- templates/configmap.yaml: Template for creating ConfigMaps.
+- templates/custom-objects.yaml: Template for custom Kubernetes objects.
+- templates/deployment.yaml: Template for creating Deployments.
+- templates/hpa.yaml: Template for Horizontal Pod Autoscaler.
+- templates/job.yaml: Template for Kubernetes Jobs.
+- templates/poddisruptionbudget.yaml: Template for Pod Disruption Budget.
+- templates/pvc.yaml: Template for Persistent Volume Claims.
+- templates/secrets.yaml: Template for Kubernetes Secrets.
+- templates/service.yaml: Template for creating Services.
\ No newline at end of file
diff --git a/examples/chart-helm/ct.yaml b/examples/online_serving/chart-helm/ct.yaml
similarity index 100%
rename from examples/chart-helm/ct.yaml
rename to examples/online_serving/chart-helm/ct.yaml
diff --git a/examples/chart-helm/lintconf.yaml b/examples/online_serving/chart-helm/lintconf.yaml
similarity index 100%
rename from examples/chart-helm/lintconf.yaml
rename to examples/online_serving/chart-helm/lintconf.yaml
diff --git a/examples/chart-helm/templates/_helpers.tpl b/examples/online_serving/chart-helm/templates/_helpers.tpl
similarity index 100%
rename from examples/chart-helm/templates/_helpers.tpl
rename to examples/online_serving/chart-helm/templates/_helpers.tpl
diff --git a/examples/chart-helm/templates/configmap.yaml b/examples/online_serving/chart-helm/templates/configmap.yaml
similarity index 100%
rename from examples/chart-helm/templates/configmap.yaml
rename to examples/online_serving/chart-helm/templates/configmap.yaml
diff --git a/examples/chart-helm/templates/custom-objects.yaml b/examples/online_serving/chart-helm/templates/custom-objects.yaml
similarity index 100%
rename from examples/chart-helm/templates/custom-objects.yaml
rename to examples/online_serving/chart-helm/templates/custom-objects.yaml
diff --git a/examples/chart-helm/templates/deployment.yaml b/examples/online_serving/chart-helm/templates/deployment.yaml
similarity index 100%
rename from examples/chart-helm/templates/deployment.yaml
rename to examples/online_serving/chart-helm/templates/deployment.yaml
diff --git a/examples/chart-helm/templates/hpa.yaml b/examples/online_serving/chart-helm/templates/hpa.yaml
similarity index 100%
rename from examples/chart-helm/templates/hpa.yaml
rename to examples/online_serving/chart-helm/templates/hpa.yaml
diff --git a/examples/chart-helm/templates/job.yaml b/examples/online_serving/chart-helm/templates/job.yaml
similarity index 100%
rename from examples/chart-helm/templates/job.yaml
rename to examples/online_serving/chart-helm/templates/job.yaml
diff --git a/examples/chart-helm/templates/poddisruptionbudget.yaml b/examples/online_serving/chart-helm/templates/poddisruptionbudget.yaml
similarity index 100%
rename from examples/chart-helm/templates/poddisruptionbudget.yaml
rename to examples/online_serving/chart-helm/templates/poddisruptionbudget.yaml
diff --git a/examples/chart-helm/templates/pvc.yaml b/examples/online_serving/chart-helm/templates/pvc.yaml
similarity index 100%
rename from examples/chart-helm/templates/pvc.yaml
rename to examples/online_serving/chart-helm/templates/pvc.yaml
diff --git a/examples/chart-helm/templates/secrets.yaml b/examples/online_serving/chart-helm/templates/secrets.yaml
similarity index 100%
rename from examples/chart-helm/templates/secrets.yaml
rename to examples/online_serving/chart-helm/templates/secrets.yaml
diff --git a/examples/chart-helm/templates/service.yaml b/examples/online_serving/chart-helm/templates/service.yaml
similarity index 100%
rename from examples/chart-helm/templates/service.yaml
rename to examples/online_serving/chart-helm/templates/service.yaml
diff --git a/examples/chart-helm/values.schema.json b/examples/online_serving/chart-helm/values.schema.json
similarity index 100%
rename from examples/chart-helm/values.schema.json
rename to examples/online_serving/chart-helm/values.schema.json
diff --git a/examples/chart-helm/values.yaml b/examples/online_serving/chart-helm/values.yaml
similarity index 100%
rename from examples/chart-helm/values.yaml
rename to examples/online_serving/chart-helm/values.yaml
diff --git a/examples/disaggregated_prefill.sh b/examples/online_serving/disaggregated_prefill.sh
similarity index 100%
rename from examples/disaggregated_prefill.sh
rename to examples/online_serving/disaggregated_prefill.sh
diff --git a/examples/gradio_openai_chatbot_webserver.py b/examples/online_serving/gradio_openai_chatbot_webserver.py
similarity index 100%
rename from examples/gradio_openai_chatbot_webserver.py
rename to examples/online_serving/gradio_openai_chatbot_webserver.py
diff --git a/examples/gradio_webserver.py b/examples/online_serving/gradio_webserver.py
similarity index 100%
rename from examples/gradio_webserver.py
rename to examples/online_serving/gradio_webserver.py
diff --git a/examples/openai_chat_completion_client.py b/examples/online_serving/openai_chat_completion_client.py
similarity index 100%
rename from examples/openai_chat_completion_client.py
rename to examples/online_serving/openai_chat_completion_client.py
diff --git a/examples/openai_chat_completion_client_for_multimodal.py b/examples/online_serving/openai_chat_completion_client_for_multimodal.py
similarity index 100%
rename from examples/openai_chat_completion_client_for_multimodal.py
rename to examples/online_serving/openai_chat_completion_client_for_multimodal.py
diff --git a/examples/openai_chat_completion_client_with_tools.py b/examples/online_serving/openai_chat_completion_client_with_tools.py
similarity index 100%
rename from examples/openai_chat_completion_client_with_tools.py
rename to examples/online_serving/openai_chat_completion_client_with_tools.py
diff --git a/examples/openai_chat_completion_structured_outputs.py b/examples/online_serving/openai_chat_completion_structured_outputs.py
similarity index 100%
rename from examples/openai_chat_completion_structured_outputs.py
rename to examples/online_serving/openai_chat_completion_structured_outputs.py
diff --git a/examples/openai_chat_embedding_client_for_multimodal.py b/examples/online_serving/openai_chat_embedding_client_for_multimodal.py
similarity index 100%
rename from examples/openai_chat_embedding_client_for_multimodal.py
rename to examples/online_serving/openai_chat_embedding_client_for_multimodal.py
diff --git a/examples/openai_completion_client.py b/examples/online_serving/openai_completion_client.py
similarity index 100%
rename from examples/openai_completion_client.py
rename to examples/online_serving/openai_completion_client.py
diff --git a/examples/openai_cross_encoder_score.py b/examples/online_serving/openai_cross_encoder_score.py
similarity index 100%
rename from examples/openai_cross_encoder_score.py
rename to examples/online_serving/openai_cross_encoder_score.py
diff --git a/examples/openai_embedding_client.py b/examples/online_serving/openai_embedding_client.py
similarity index 100%
rename from examples/openai_embedding_client.py
rename to examples/online_serving/openai_embedding_client.py
diff --git a/examples/openai_pooling_client.py b/examples/online_serving/openai_pooling_client.py
similarity index 100%
rename from examples/openai_pooling_client.py
rename to examples/online_serving/openai_pooling_client.py
diff --git a/examples/opentelemetry/Otel.md b/examples/online_serving/opentelemetry/Otel.md
similarity index 100%
rename from examples/opentelemetry/Otel.md
rename to examples/online_serving/opentelemetry/Otel.md
diff --git a/examples/opentelemetry/dummy_client.py b/examples/online_serving/opentelemetry/dummy_client.py
similarity index 100%
rename from examples/opentelemetry/dummy_client.py
rename to examples/online_serving/opentelemetry/dummy_client.py
diff --git a/examples/prometheus_grafana/README.md b/examples/online_serving/prometheus_grafana/README.md
similarity index 100%
rename from examples/prometheus_grafana/README.md
rename to examples/online_serving/prometheus_grafana/README.md
diff --git a/examples/prometheus_grafana/docker-compose.yaml b/examples/online_serving/prometheus_grafana/docker-compose.yaml
similarity index 100%
rename from examples/prometheus_grafana/docker-compose.yaml
rename to examples/online_serving/prometheus_grafana/docker-compose.yaml
diff --git a/examples/prometheus_grafana/grafana.json b/examples/online_serving/prometheus_grafana/grafana.json
similarity index 100%
rename from examples/prometheus_grafana/grafana.json
rename to examples/online_serving/prometheus_grafana/grafana.json
diff --git a/examples/prometheus_grafana/prometheus.yaml b/examples/online_serving/prometheus_grafana/prometheus.yaml
similarity index 100%
rename from examples/prometheus_grafana/prometheus.yaml
rename to examples/online_serving/prometheus_grafana/prometheus.yaml
diff --git a/examples/run_cluster.sh b/examples/online_serving/run_cluster.sh
similarity index 100%
rename from examples/run_cluster.sh
rename to examples/online_serving/run_cluster.sh
diff --git a/examples/sagemaker-entrypoint.sh b/examples/online_serving/sagemaker-entrypoint.sh
similarity index 100%
rename from examples/sagemaker-entrypoint.sh
rename to examples/online_serving/sagemaker-entrypoint.sh
diff --git a/examples/fp8/README.md b/examples/other/fp8/README.md
similarity index 88%
rename from examples/fp8/README.md
rename to examples/other/fp8/README.md
index 5492872cae93a..4e8031d954113 100644
--- a/examples/fp8/README.md
+++ b/examples/other/fp8/README.md
@@ -20,12 +20,12 @@ Before incorporating the FP8 datatype for inference workloads, you must adhere t
 ### 2. Convert HF model into a quantized HF model.
 Note: The following steps are adapted from the [TensorRT-LLM repository](https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/quantization/README.md).
 
-`quantize.py` (examples/fp8/quantizer/quantize.py) uses the quantization toolkit  (AMMO) to calibrate the PyTorch models and export TensorRT-LLM checkpoints. Each TensorRT-LLM checkpoint contains a config file (in .json format) and one or several rank weight files (in .safetensors format).
+`quantize.py` (examples/other/fp8/quantizer/quantize.py) uses the quantization toolkit  (AMMO) to calibrate the PyTorch models and export TensorRT-LLM checkpoints. Each TensorRT-LLM checkpoint contains a config file (in .json format) and one or several rank weight files (in .safetensors format).
 
-The detailed quantization toolkit (AMMO) conversion guide for FP8 can be found at `examples/fp8/quantizer/README.md`.
+The detailed quantization toolkit (AMMO) conversion guide for FP8 can be found at `examples/other/fp8/quantizer/README.md`.
 
 ### 3. Extract KV Cache Scaling Factors from quantized HF model.
-`extract_scales.py` (examples/fp8/extract_scales.py) can be utilized to extract the KV cache scaling factors from your quantized HF model, however at the moment, this tool exclusively supports Llama 2 models. It is also important to note the following:
+`extract_scales.py` (examples/other/fp8/extract_scales.py) can be utilized to extract the KV cache scaling factors from your quantized HF model, however at the moment, this tool exclusively supports Llama 2 models. It is also important to note the following:
 1. **File Structure**: The utility operates under the assumption that all parameters, including KV cache scaling factors, corresponding to a particular Tensor Parallelism (TP) rank are stored in a single file. These files must adhere to a specific naming convention where the TP rank is immediately identified after a specific keyword (e.g., "rank") in the filename.
 
 2. **TP Decomposition**: The utility assumes consistency between the TP decomposition employed by the quantizer tool and that used by vLLM.
@@ -35,7 +35,7 @@ The detailed quantization toolkit (AMMO) conversion guide for FP8 can be found a
 ```python
 # prerequisites:
 # - Quantized HF LLaMa 2 model 
-python3 examples/fp8/extract_scales.py --help
+python3 examples/other/fp8/extract_scales.py --help
 Usage: extract_scales.py [-h] --quantized_model QUANTIZED_MODEL [--load_format {auto,safetensors,npz,pt}] [--output_dir OUTPUT_DIR] [--output_name OUTPUT_NAME] [--tp_size TP_SIZE]
 
 KV Scale Extraction Example
@@ -52,7 +52,7 @@ Optional arguments:
 ```
 ```python
 Example:
-python3 examples/fp8/extract_scales.py --quantized_model <QUANTIZED_MODEL_DIR> --tp_size <TENSOR_PARALLEL_SIZE> --output_dir <PATH_TO_OUTPUT_DIR>
+python3 examples/other/fp8/extract_scales.py --quantized_model <QUANTIZED_MODEL_DIR> --tp_size <TENSOR_PARALLEL_SIZE> --output_dir <PATH_TO_OUTPUT_DIR>
 ```
 ### 4. Load KV Cache Scaling Factors into VLLM.
 This script evaluates the inference throughput of language models using various backends such as vLLM. It measures the time taken to process a given number of prompts and generate sequences for each prompt. The recently generated KV cache scaling factors are now integrated into the benchmarking process and allow for KV cache scaling factors to be utilized for FP8.
diff --git a/examples/fp8/extract_scales.py b/examples/other/fp8/extract_scales.py
similarity index 100%
rename from examples/fp8/extract_scales.py
rename to examples/other/fp8/extract_scales.py
diff --git a/examples/fp8/quantizer/README.md b/examples/other/fp8/quantizer/README.md
similarity index 100%
rename from examples/fp8/quantizer/README.md
rename to examples/other/fp8/quantizer/README.md
diff --git a/examples/fp8/quantizer/quantize.py b/examples/other/fp8/quantizer/quantize.py
similarity index 100%
rename from examples/fp8/quantizer/quantize.py
rename to examples/other/fp8/quantizer/quantize.py
diff --git a/examples/logging_configuration.md b/examples/other/logging_configuration.md
similarity index 100%
rename from examples/logging_configuration.md
rename to examples/other/logging_configuration.md
diff --git a/examples/tensorize_vllm_model.py b/examples/other/tensorize_vllm_model.py
similarity index 96%
rename from examples/tensorize_vllm_model.py
rename to examples/other/tensorize_vllm_model.py
index dd77a4ad0c6b7..5fff1fdf502c9 100644
--- a/examples/tensorize_vllm_model.py
+++ b/examples/other/tensorize_vllm_model.py
@@ -25,7 +25,7 @@
 To serialize a model, install vLLM from source, then run something 
 like this from the root level of this repository:
 
-python -m examples.tensorize_vllm_model \
+python -m examples.other.tensorize_vllm_model \
    --model facebook/opt-125m \
    serialize \
    --serialized-directory s3://my-bucket \
@@ -45,7 +45,7 @@
 To deserialize a model, you can run something like this from the root 
 level of this repository:
 
-python -m examples.tensorize_vllm_model \
+python -m examples.other.tensorize_vllm_model \
    --model EleutherAI/gpt-j-6B \
    --dtype float16 \
    deserialize \
@@ -63,11 +63,11 @@
 model-rank-%03d.tensors
 
 For more information on the available arguments for serializing, run 
-`python -m examples.tensorize_vllm_model serialize --help`.
+`python -m examples.other.tensorize_vllm_model serialize --help`.
 
 Or for deserializing:
 
-`python -m examples.tensorize_vllm_model deserialize --help`.
+`python -m examples.other.tensorize_vllm_model deserialize --help`.
 
 Once a model is serialized, tensorizer can be invoked with the `LLM` class 
 directly to load models:
@@ -88,7 +88,7 @@
 In order to see all of the available arguments usable to configure 
 loading with tensorizer that are given to `TensorizerConfig`, run:
 
-`python -m examples.tensorize_vllm_model deserialize --help`
+`python -m examples.other.tensorize_vllm_model deserialize --help`
 
 under the `tensorizer options` section. These can also be used for
 deserialization in this example script, although `--tensorizer-uri` and
diff --git a/pyproject.toml b/pyproject.toml
index 45fa4bff4e680..0ac3f39ef7a5f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -20,7 +20,7 @@ build-backend = "setuptools.build_meta"
 line-length = 80
 exclude = [
     # External file, leaving license intact
-    "examples/fp8/quantizer/quantize.py"
+    "examples/other/fp8/quantizer/quantize.py"
 ]
 
 [tool.ruff.lint.per-file-ignores]
diff --git a/tests/plugins_tests/test_platform_plugins.py b/tests/plugins_tests/test_platform_plugins.py
index 0d27cf9f152e0..57518bd3e8299 100644
--- a/tests/plugins_tests/test_platform_plugins.py
+++ b/tests/plugins_tests/test_platform_plugins.py
@@ -5,7 +5,7 @@ def test_platform_plugins():
     import os
     example_file = os.path.join(
         os.path.dirname(os.path.dirname(os.path.dirname(current_file))),
-        "examples", "offline_inference.py")
+        "examples", "offline_inference/offline_inference.py")
     runpy.run_path(example_file)
 
     # check if the plugin is loaded correctly
diff --git a/tests/tensorizer_loader/test_tensorizer.py b/tests/tensorizer_loader/test_tensorizer.py
index 0b0792b6b845f..bf409d2d97aa1 100644
--- a/tests/tensorizer_loader/test_tensorizer.py
+++ b/tests/tensorizer_loader/test_tensorizer.py
@@ -163,8 +163,8 @@ def test_deserialized_hf_model_has_same_outputs(hf_runner, vllm_runner,
 
 def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path):
     multilora_inference = import_from_path(
-        "examples.multilora_inference",
-        EXAMPLES_PATH / "multilora_inference.py",
+        "examples.offline_inference.multilora_inference",
+        EXAMPLES_PATH / "offline_inference/multilora_inference.py",
     )
 
     model_ref = "meta-llama/Llama-2-7b-hf"
diff --git a/tools/profiler/print_layerwise_table.py b/tools/profiler/print_layerwise_table.py
index 394ca8663e189..49366abc7fb56 100644
--- a/tools/profiler/print_layerwise_table.py
+++ b/tools/profiler/print_layerwise_table.py
@@ -31,7 +31,7 @@ def get_entries(node, curr_depth=0):
                         type=str,
                         required=True,
                         help="json trace file output by "
-                        "examples/offline_profile.py")
+                        "examples/offline_inference/offline_profile.py")
     parser.add_argument("--phase",
                         type=str,
                         required=True,
diff --git a/tools/profiler/visualize_layerwise_profile.py b/tools/profiler/visualize_layerwise_profile.py
index da7a28da15c19..fa88ed4204d8f 100644
--- a/tools/profiler/visualize_layerwise_profile.py
+++ b/tools/profiler/visualize_layerwise_profile.py
@@ -534,11 +534,11 @@ def make_plot_title_suffix(profile_json: dict) -> str:
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
 
-    parser.add_argument(
-        "--json-trace",
-        type=str,
-        required=True,
-        help="json trace file output by examples/offline_profile.py")
+    parser.add_argument("--json-trace",
+                        type=str,
+                        required=True,
+                        help="json trace file output by \
+                              examples/offline_inference/offline_profile.py")
     parser.add_argument("--output-directory",
                         type=str,
                         required=False,
diff --git a/vllm/distributed/kv_transfer/README.md b/vllm/distributed/kv_transfer/README.md
index dab2d10c4c9d0..e20c992a381a3 100644
--- a/vllm/distributed/kv_transfer/README.md
+++ b/vllm/distributed/kv_transfer/README.md
@@ -22,7 +22,7 @@ NOTE: If you want to not only transfer KV caches, but adjust the model execution
 
 ## Disaggregated prefilling
 
-The example usage is in [this file](../../../examples/disaggregated_prefill.sh).
+The example usage is in [this file](../../../examples/online_serving/disaggregated_prefill.sh).
 
 Here is the diagram of how we run disaggretgated prefilling.
 
diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py
index a9c1fa7221217..0033fbff0e9ac 100644
--- a/vllm/model_executor/model_loader/loader.py
+++ b/vllm/model_executor/model_loader/loader.py
@@ -452,9 +452,9 @@ def _load_model_serialized_cpu(
         """Load a serialized model with tensorizer to the CPU.
 
         This is only necessary when the model isn't vLLM-tensorized (see
-        examples/tensorize_vllm_model.py) This should still be faster than
-        default HuggingFace loading, but will be slower than loading a
-        vLLM-tensorized model.
+        examples/other/tensorize_vllm_model.py). This should still
+        be faster than default HuggingFace loading, but will be slower than
+        loading a vLLM-tensorized model.
         """
         device_config = vllm_config.device_config
         model_config = vllm_config.model_config
@@ -472,7 +472,7 @@ def _load_model_serialized(
         """Load a serialized model with tensorizer.
 
         Expects a vLLM-tensorized model. See the
-        examples/tensorize_vllm_model.py example script
+        examples/other/tensorize_vllm_model.py example script
         for serializing vLLM models."""
 
         device_config = vllm_config.device_config
@@ -529,7 +529,8 @@ class ShardedStateLoader(BaseModelLoader):
     Model loader that directly loads each worker's model state dict, which
     enables a fast load path for large tensor-parallel models where each worker
     only needs to read its own shard rather than the entire checkpoint. See
-    `examples/save_sharded_state.py` for creating a sharded checkpoint.
+    `examples/offline_inference/save_sharded_state.py` for creating a sharded
+    checkpoint.
     """
 
     DEFAULT_PATTERN = "model-rank-{rank}-part-{part}.safetensors"
diff --git a/vllm/model_executor/model_loader/tensorizer.py b/vllm/model_executor/model_loader/tensorizer.py
index 8b929f299c8d8..fbd4937112e11 100644
--- a/vllm/model_executor/model_loader/tensorizer.py
+++ b/vllm/model_executor/model_loader/tensorizer.py
@@ -155,7 +155,7 @@ class TensorizerArgs:
       encryption_keyfile: File path to a binary file containing a  
           binary key to use for decryption. `None` (the default) means 
           no decryption. See the example script in 
-          examples/tensorize_vllm_model.py. 
+          examples/other/tensorize_vllm_model.py. 
       s3_access_key_id: The access key for the S3 bucket. Can also be set via
           the S3_ACCESS_KEY_ID environment variable.
       s3_secret_access_key: The secret access key for the S3 bucket. Can also
@@ -363,12 +363,12 @@ def deserialize(self):
 def tensorizer_weights_iterator(
     tensorizer_args: "TensorizerArgs"
 ) -> Generator[Tuple[str, torch.Tensor], None, None]:
-    logger.warning(
-        "Deserializing HuggingFace models is not optimized for "
-        "loading on vLLM, as tensorizer is forced to load to CPU. "
-        "Consider deserializing a vLLM model instead for faster "
-        "load times. See the examples/tensorize_vllm_model.py example "
-        "script for serializing vLLM models.")
+    logger.warning("Deserializing HuggingFace models is not optimized for "
+                   "loading on vLLM, as tensorizer is forced to load to CPU. "
+                   "Consider deserializing a vLLM model instead for faster "
+                   "load times. See the "
+                   "examples/other/tensorize_vllm_model.py example script "
+                   "for serializing vLLM models.")
 
     deserializer_args = tensorizer_args.deserializer_params
     stream_params = tensorizer_args.stream_params
diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py
index 8aa0c98df70d2..a2c991cfdb74e 100644
--- a/vllm/model_executor/model_loader/weight_utils.py
+++ b/vllm/model_executor/model_loader/weight_utils.py
@@ -503,7 +503,8 @@ def kv_cache_scales_loader(
     KV cache scaling factors. The serialization should represent a dictionary
     whose keys are the TP ranks and values are another dictionary mapping layers
     to their KV cache scaling factors.
-    Keep this function in sync with the output of examples/fp8/extract_scales.py
+    Keep this function in sync with the output of
+    examples/other/fp8/extract_scales.py
     """
     try:
         with open(filename) as f:

From 6cd40a5bfed24ef0ceca83b0450be6920d8ca6d4 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Wed, 8 Jan 2025 21:34:44 +0800
Subject: [PATCH 040/115] [Doc][4/N] Reorganize API Reference (#11843)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 .buildkite/test-pipeline.yaml                 |   2 +-
 Dockerfile                                    |   4 ++--
 .../{dev => api}/engine/async_llm_engine.md   |   0
 .../engine_index.md => api/engine/index.md}   |   0
 docs/source/{dev => api}/engine/llm_engine.md |   0
 .../multimodal/index.md}                      |  10 --------
 .../offline_inference/index.md}               |   0
 .../{dev => api}/offline_inference/llm.md     |   0
 .../offline_inference/llm_inputs.md           |   0
 docs/source/api/params.md                     |  22 ++++++++++++++++++
 .../dockerfile-stages-dependency.png          | Bin
 .../contributing/dockerfile/dockerfile.md     |   2 +-
 docs/source/design/arch_overview.md           |   2 +-
 .../multimodal/adding_multimodal_plugin.md    |  16 -------------
 docs/source/dev/pooling_params.md             |   6 -----
 docs/source/dev/sampling_params.md            |   6 -----
 docs/source/getting_started/quickstart.md     |   2 +-
 docs/source/index.md                          |   9 ++++---
 docs/source/serving/offline_inference.md      |   2 +-
 .../serving/openai_compatible_server.md       |   8 +++----
 vllm/multimodal/base.py                       |   3 ---
 vllm/multimodal/inputs.py                     |   6 -----
 vllm/multimodal/registry.py                   |   3 ---
 vllm/pooling_params.py                        |   2 +-
 24 files changed, 38 insertions(+), 67 deletions(-)
 rename docs/source/{dev => api}/engine/async_llm_engine.md (100%)
 rename docs/source/{dev/engine/engine_index.md => api/engine/index.md} (100%)
 rename docs/source/{dev => api}/engine/llm_engine.md (100%)
 rename docs/source/{design/multimodal/multimodal_index.md => api/multimodal/index.md} (84%)
 rename docs/source/{dev/offline_inference/offline_index.md => api/offline_inference/index.md} (100%)
 rename docs/source/{dev => api}/offline_inference/llm.md (100%)
 rename docs/source/{dev => api}/offline_inference/llm_inputs.md (100%)
 create mode 100644 docs/source/api/params.md
 rename docs/source/assets/{dev => contributing}/dockerfile-stages-dependency.png (100%)
 delete mode 100644 docs/source/design/multimodal/adding_multimodal_plugin.md
 delete mode 100644 docs/source/dev/pooling_params.md
 delete mode 100644 docs/source/dev/sampling_params.md

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index b7178b94f481a..f883595f6d9ad 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -38,7 +38,7 @@ steps:
   - pip install -r requirements-docs.txt
   - SPHINXOPTS=\"-W\" make html
   # Check API reference (if it fails, you may have missing mock imports)
-  - grep \"sig sig-object py\" build/html/dev/sampling_params.html
+  - grep \"sig sig-object py\" build/html/api/params.html
 
 - label: Async Engine, Inputs, Utils, Worker Test # 24min
   fast_check: true
diff --git a/Dockerfile b/Dockerfile
index 808cf675acf4d..4542bc9cf0bd2 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -2,8 +2,8 @@
 # to run the OpenAI compatible server.
 
 # Please update any changes made here to
-# docs/source/dev/dockerfile/dockerfile.md and
-# docs/source/assets/dev/dockerfile-stages-dependency.png
+# docs/source/contributing/dockerfile/dockerfile.md and
+# docs/source/assets/contributing/dockerfile-stages-dependency.png
 
 ARG CUDA_VERSION=12.4.1
 #################### BASE BUILD IMAGE ####################
diff --git a/docs/source/dev/engine/async_llm_engine.md b/docs/source/api/engine/async_llm_engine.md
similarity index 100%
rename from docs/source/dev/engine/async_llm_engine.md
rename to docs/source/api/engine/async_llm_engine.md
diff --git a/docs/source/dev/engine/engine_index.md b/docs/source/api/engine/index.md
similarity index 100%
rename from docs/source/dev/engine/engine_index.md
rename to docs/source/api/engine/index.md
diff --git a/docs/source/dev/engine/llm_engine.md b/docs/source/api/engine/llm_engine.md
similarity index 100%
rename from docs/source/dev/engine/llm_engine.md
rename to docs/source/api/engine/llm_engine.md
diff --git a/docs/source/design/multimodal/multimodal_index.md b/docs/source/api/multimodal/index.md
similarity index 84%
rename from docs/source/design/multimodal/multimodal_index.md
rename to docs/source/api/multimodal/index.md
index e4f2171e84ff7..0046b73ea825e 100644
--- a/docs/source/design/multimodal/multimodal_index.md
+++ b/docs/source/api/multimodal/index.md
@@ -11,18 +11,8 @@ vLLM provides experimental support for multi-modal models through the {mod}`vllm
 Multi-modal inputs can be passed alongside text and token prompts to [supported models](#supported-mm-models)
 via the `multi_modal_data` field in {class}`vllm.inputs.PromptType`.
 
-Currently, vLLM only has built-in support for image data. You can extend vLLM to process additional modalities
-by following [this guide](#adding-multimodal-plugin).
-
 Looking to add your own multi-modal model? Please follow the instructions listed [here](#enabling-multimodal-inputs).
 
-## Guides
-
-```{toctree}
-:maxdepth: 1
-
-adding_multimodal_plugin
-```
 
 ## Module Contents
 
diff --git a/docs/source/dev/offline_inference/offline_index.md b/docs/source/api/offline_inference/index.md
similarity index 100%
rename from docs/source/dev/offline_inference/offline_index.md
rename to docs/source/api/offline_inference/index.md
diff --git a/docs/source/dev/offline_inference/llm.md b/docs/source/api/offline_inference/llm.md
similarity index 100%
rename from docs/source/dev/offline_inference/llm.md
rename to docs/source/api/offline_inference/llm.md
diff --git a/docs/source/dev/offline_inference/llm_inputs.md b/docs/source/api/offline_inference/llm_inputs.md
similarity index 100%
rename from docs/source/dev/offline_inference/llm_inputs.md
rename to docs/source/api/offline_inference/llm_inputs.md
diff --git a/docs/source/api/params.md b/docs/source/api/params.md
new file mode 100644
index 0000000000000..a3b4d9cbb44ec
--- /dev/null
+++ b/docs/source/api/params.md
@@ -0,0 +1,22 @@
+# Optional Parameters
+
+Optional parameters for vLLM APIs.
+
+(sampling-params)=
+
+## Sampling Parameters
+
+```{eval-rst}
+.. autoclass:: vllm.SamplingParams
+    :members:
+```
+
+(pooling-params)=
+
+## Pooling Parameters
+
+```{eval-rst}
+.. autoclass:: vllm.PoolingParams
+    :members:
+```
+
diff --git a/docs/source/assets/dev/dockerfile-stages-dependency.png b/docs/source/assets/contributing/dockerfile-stages-dependency.png
similarity index 100%
rename from docs/source/assets/dev/dockerfile-stages-dependency.png
rename to docs/source/assets/contributing/dockerfile-stages-dependency.png
diff --git a/docs/source/contributing/dockerfile/dockerfile.md b/docs/source/contributing/dockerfile/dockerfile.md
index 38ea956ba8dfb..cb142318b8724 100644
--- a/docs/source/contributing/dockerfile/dockerfile.md
+++ b/docs/source/contributing/dockerfile/dockerfile.md
@@ -17,7 +17,7 @@ The edges of the build graph represent:
 
 - `RUN --mount=(.\*)from=...` dependencies (with a dotted line and an empty diamond arrow head)
 
-  > ```{figure} ../../assets/dev/dockerfile-stages-dependency.png
+  > ```{figure} /assets/contributing/dockerfile-stages-dependency.png
   > :align: center
   > :alt: query
   > :width: 100%
diff --git a/docs/source/design/arch_overview.md b/docs/source/design/arch_overview.md
index 5e0dd021ad02e..cec503ef2f77d 100644
--- a/docs/source/design/arch_overview.md
+++ b/docs/source/design/arch_overview.md
@@ -53,7 +53,7 @@ for output in outputs:
 ```
 
 More API details can be found in the {doc}`Offline Inference
-</dev/offline_inference/offline_index>` section of the API docs.
+</api/offline_inference/index>` section of the API docs.
 
 The code for the `LLM` class can be found in <gh-file:vllm/entrypoints/llm.py>.
 
diff --git a/docs/source/design/multimodal/adding_multimodal_plugin.md b/docs/source/design/multimodal/adding_multimodal_plugin.md
deleted file mode 100644
index bcccd284879bb..0000000000000
--- a/docs/source/design/multimodal/adding_multimodal_plugin.md
+++ /dev/null
@@ -1,16 +0,0 @@
-(adding-multimodal-plugin)=
-
-# Adding a Multimodal Plugin
-
-This document teaches you how to add a new modality to vLLM.
-
-Each modality in vLLM is represented by a {class}`~vllm.multimodal.MultiModalPlugin` and registered to {data}`~vllm.multimodal.MULTIMODAL_REGISTRY`.
-For vLLM to recognize a new modality type, you have to create a new plugin and then pass it to {meth}`~vllm.multimodal.MultiModalRegistry.register_plugin`.
-
-The remainder of this document details how to define custom {class}`~vllm.multimodal.MultiModalPlugin` s.
-
-```{note}
-This article is a work in progress.
-```
-
-% TODO: Add more instructions on how to add new plugins once embeddings is in.
diff --git a/docs/source/dev/pooling_params.md b/docs/source/dev/pooling_params.md
deleted file mode 100644
index 74b2c57443e4b..0000000000000
--- a/docs/source/dev/pooling_params.md
+++ /dev/null
@@ -1,6 +0,0 @@
-# Pooling Parameters
-
-```{eval-rst}
-.. autoclass:: vllm.PoolingParams
-    :members:
-```
diff --git a/docs/source/dev/sampling_params.md b/docs/source/dev/sampling_params.md
deleted file mode 100644
index bdc36af5153db..0000000000000
--- a/docs/source/dev/sampling_params.md
+++ /dev/null
@@ -1,6 +0,0 @@
-# Sampling Parameters
-
-```{eval-rst}
-.. autoclass:: vllm.SamplingParams
-    :members:
-```
diff --git a/docs/source/getting_started/quickstart.md b/docs/source/getting_started/quickstart.md
index 6b56918ce5638..2808e1b386801 100644
--- a/docs/source/getting_started/quickstart.md
+++ b/docs/source/getting_started/quickstart.md
@@ -42,7 +42,7 @@ The first line of this example imports the classes {class}`~vllm.LLM` and {class
 from vllm import LLM, SamplingParams
 ```
 
-The next section defines a list of input prompts and sampling parameters for text generation. The [sampling temperature](https://arxiv.org/html/2402.05201v1) is set to `0.8` and the [nucleus sampling probability](https://en.wikipedia.org/wiki/Top-p_sampling) is set to `0.95`. You can find more information about the sampling parameters [here](https://docs.vllm.ai/en/stable/dev/sampling_params.html).
+The next section defines a list of input prompts and sampling parameters for text generation. The [sampling temperature](https://arxiv.org/html/2402.05201v1) is set to `0.8` and the [nucleus sampling probability](https://en.wikipedia.org/wiki/Top-p_sampling) is set to `0.95`. You can find more information about the sampling parameters [here](#sampling-params).
 
 ```python
 prompts = [
diff --git a/docs/source/index.md b/docs/source/index.md
index 11d3e24a9b60a..6747a7fcce4fe 100644
--- a/docs/source/index.md
+++ b/docs/source/index.md
@@ -137,10 +137,10 @@ community/sponsors
 :caption: API Reference
 :maxdepth: 2
 
-dev/sampling_params
-dev/pooling_params
-dev/offline_inference/offline_index
-dev/engine/engine_index
+api/offline_inference/index
+api/engine/index
+api/multimodal/index
+api/params
 ```
 
 % Design Documents: Details about vLLM internals
@@ -154,7 +154,6 @@ design/huggingface_integration
 design/plugin_system
 design/kernel/paged_attention
 design/input_processing/model_inputs_index
-design/multimodal/multimodal_index
 design/automatic_prefix_caching
 design/multiprocessing
 ```
diff --git a/docs/source/serving/offline_inference.md b/docs/source/serving/offline_inference.md
index 83178f7811825..79092ab208784 100644
--- a/docs/source/serving/offline_inference.md
+++ b/docs/source/serving/offline_inference.md
@@ -23,7 +23,7 @@ The available APIs depend on the type of model that is being run:
 Please refer to the above pages for more details about each API.
 
 ```{seealso}
-[API Reference](/dev/offline_inference/offline_index)
+[API Reference](/api/offline_inference/index)
 ```
 
 ## Configuration Options
diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md
index 022dd3ae8a237..ec5a367594743 100644
--- a/docs/source/serving/openai_compatible_server.md
+++ b/docs/source/serving/openai_compatible_server.md
@@ -195,7 +195,7 @@ Code example: <gh-file:examples/online_serving/openai_completion_client.py>
 
 #### Extra parameters
 
-The following [sampling parameters (click through to see documentation)](../dev/sampling_params.md) are supported.
+The following [sampling parameters](#sampling-params) are supported.
 
 ```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py
 :language: python
@@ -226,7 +226,7 @@ Code example: <gh-file:examples/online_serving/openai_chat_completion_client.py>
 
 #### Extra parameters
 
-The following [sampling parameters (click through to see documentation)](../dev/sampling_params.md) are supported.
+The following [sampling parameters](#sampling-params) are supported.
 
 ```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py
 :language: python
@@ -259,7 +259,7 @@ Code example: <gh-file:examples/online_serving/openai_embedding_client.py>
 
 #### Extra parameters
 
-The following [pooling parameters (click through to see documentation)](../dev/pooling_params.md) are supported.
+The following [pooling parameters](#pooling-params) are supported.
 
 ```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py
 :language: python
@@ -447,7 +447,7 @@ Response:
 
 #### Extra parameters
 
-The following [pooling parameters (click through to see documentation)](../dev/pooling_params.md) are supported.
+The following [pooling parameters](#pooling-params) are supported.
 
 ```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py
 :language: python
diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py
index 7f4029e726332..4941fbac963ca 100644
--- a/vllm/multimodal/base.py
+++ b/vllm/multimodal/base.py
@@ -49,9 +49,6 @@ class MultiModalPlugin(ABC):
     process the same data differently). This registry is in turn used by
     :class:`~MultiModalRegistry` which acts at a higher level
     (i.e., the modality of the data).
-
-    See also:
-        :ref:`adding-multimodal-plugin`
     """
 
     def __init__(self) -> None:
diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py
index 8fdcc4b524035..d542461874866 100644
--- a/vllm/multimodal/inputs.py
+++ b/vllm/multimodal/inputs.py
@@ -99,12 +99,6 @@ class MultiModalDataBuiltins(TypedDict, total=False):
 MultiModalDataDict: TypeAlias = Mapping[str, ModalityData[Any]]
 """
 A dictionary containing an entry for each modality type to input.
-
-Note:
-    This dictionary also accepts modality keys defined outside
-    :class:`MultiModalDataBuiltins` as long as a customized plugin
-    is registered through the :class:`~vllm.multimodal.MULTIMODAL_REGISTRY`.
-    Read more on that :ref:`here <adding-multimodal-plugin>`.
 """
 
 
diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py
index 5f01eac4edade..9eceefb08c93f 100644
--- a/vllm/multimodal/registry.py
+++ b/vllm/multimodal/registry.py
@@ -125,9 +125,6 @@ def __init__(
     def register_plugin(self, plugin: MultiModalPlugin) -> None:
         """
         Register a multi-modal plugin so it can be recognized by vLLM.
-
-        See also:
-            :ref:`adding-multimodal-plugin`
         """
         data_type_key = plugin.get_data_key()
 
diff --git a/vllm/pooling_params.py b/vllm/pooling_params.py
index 2635c0bccd1c4..b24b7e91a7ae7 100644
--- a/vllm/pooling_params.py
+++ b/vllm/pooling_params.py
@@ -7,7 +7,7 @@ class PoolingParams(
         msgspec.Struct,
         omit_defaults=True,  # type: ignore[call-arg]
         array_like=True):  # type: ignore[call-arg]
-    """Pooling parameters for embeddings API.
+    """API parameters for pooling models. This is currently a placeholder.
 
     Attributes:
         additional_data: Any additional data needed for pooling.

From 2f7024987e582b85b280909b87287668cd97c92f Mon Sep 17 00:00:00 2001
From: "Li, Jiang" <jiang1.li@intel.com>
Date: Wed, 8 Jan 2025 23:18:28 +0800
Subject: [PATCH 041/115] [CI/Build][Bugfix] Fix CPU CI image clean up (#11836)

Signed-off-by: jiang1.li <jiang1.li@intel.com>
---
 .buildkite/run-cpu-test.sh               | 7 ++-----
 vllm/model_executor/layers/activation.py | 2 +-
 2 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh
index 87d08c8c7fdcb..1a4dae8f65e99 100644
--- a/.buildkite/run-cpu-test.sh
+++ b/.buildkite/run-cpu-test.sh
@@ -13,7 +13,7 @@ numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build -t cpu-test-"$BUILDKITE_BU
 numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 -f Dockerfile.cpu .
 
 # Setup cleanup
-remove_docker_container() { docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true; }
+remove_docker_container() { set -e; docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true; }
 trap remove_docker_container EXIT
 remove_docker_container
 
@@ -35,10 +35,7 @@ function cpu_tests() {
   # Run basic model test
   docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
     set -e
-    pip install pytest pytest-asyncio \
-      decord einops librosa peft Pillow sentence-transformers soundfile \
-      transformers_stream_generator matplotlib datamodel_code_generator
-    pip install torchvision --index-url https://download.pytorch.org/whl/cpu
+    pip install -r vllm/requirements-test.txt
     pytest -v -s tests/models/decoder_only/language -m cpu_model
     pytest -v -s tests/models/embedding/language -m cpu_model
     pytest -v -s tests/models/encoder_decoder/language -m cpu_model
diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py
index 46d4670bfe4f9..b8a302cf5087f 100644
--- a/vllm/model_executor/layers/activation.py
+++ b/vllm/model_executor/layers/activation.py
@@ -61,7 +61,7 @@ class SiluAndMul(CustomOp):
 
     def __init__(self):
         super().__init__()
-        if current_platform.is_cuda_alike():
+        if current_platform.is_cuda_alike() or current_platform.is_cpu():
             self.op = torch.ops._C.silu_and_mul
         elif current_platform.is_xpu():
             import intel_extension_for_pytorch as ipex

From 78f4590b60161dee1a444870ae682ba45f633502 Mon Sep 17 00:00:00 2001
From: Yan Ma <yan.ma@intel.com>
Date: Thu, 9 Jan 2025 00:11:50 +0800
Subject: [PATCH 042/115] [Bugfix][XPU] fix silu_and_mul (#11823)

Signed-off-by: yan ma <yan.ma@intel.com>
---
 vllm/model_executor/layers/activation.py | 4 ++--
 vllm/plugins/__init__.py                 | 5 ++---
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py
index b8a302cf5087f..32456fee06a28 100644
--- a/vllm/model_executor/layers/activation.py
+++ b/vllm/model_executor/layers/activation.py
@@ -64,8 +64,8 @@ def __init__(self):
         if current_platform.is_cuda_alike() or current_platform.is_cpu():
             self.op = torch.ops._C.silu_and_mul
         elif current_platform.is_xpu():
-            import intel_extension_for_pytorch as ipex
-            self.op = ipex.llm.functional.silu_and_mul
+            from vllm._ipex_ops import ipex_ops
+            self.op = ipex_ops.silu_and_mul
 
     def forward_native(self, x: torch.Tensor) -> torch.Tensor:
         """PyTorch-native implementation equivalent to forward()."""
diff --git a/vllm/plugins/__init__.py b/vllm/plugins/__init__.py
index c50eb2cef4cd5..e5fa4f0e4a2f6 100644
--- a/vllm/plugins/__init__.py
+++ b/vllm/plugins/__init__.py
@@ -63,8 +63,8 @@ def load_general_plugins():
     from vllm.platforms import current_platform
 
     if current_platform.is_xpu():
-        # see https://github.com/pytorch/pytorch/blob/8cada5cbe5450e17c26fb8b358116785324537b2/torch/_dynamo/config.py#L158  # noqa
-        os.environ['TORCH_COMPILE_DISABLE'] = 'True'
+        # see https://github.com/pytorch/pytorch/blob/43c5f59/torch/_dynamo/config.py#L158
+        torch._dynamo.config.disable = True
     if current_platform.is_hpu():
         # NOTE(kzawora): PT HPU lazy backend (PT_HPU_LAZY_MODE = 1)
         # does not support torch.compile
@@ -72,7 +72,6 @@ def load_general_plugins():
         # torch.compile support
         is_lazy = os.environ.get('PT_HPU_LAZY_MODE', '1') == '1'
         if is_lazy:
-            # see https://github.com/pytorch/pytorch/blob/43c5f59/torch/_dynamo/config.py#L158
             torch._dynamo.config.disable = True
             # NOTE(kzawora) multi-HPU inference with HPUGraphs (lazy-only)
             # requires enabling lazy collectives

From ca47e176af9e0a4fa9f02325cdad5f11b40aedab Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Thu, 9 Jan 2025 01:04:46 +0800
Subject: [PATCH 043/115] [Misc] Move some model utils into vision file
 (#11848)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 vllm/model_executor/models/clip.py     |  5 +-
 vllm/model_executor/models/pixtral.py  |  5 +-
 vllm/model_executor/models/qwen2_vl.py |  3 +-
 vllm/model_executor/models/siglip.py   |  5 +-
 vllm/model_executor/models/utils.py    | 37 +-----------
 vllm/model_executor/models/vision.py   | 83 +++++++++++++++++++++++++-
 vllm/multimodal/inputs.py              |  4 +-
 vllm/multimodal/utils.py               | 44 --------------
 8 files changed, 94 insertions(+), 92 deletions(-)

diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py
index 1bde45cb140cb..dd69f6c9a5aff 100644
--- a/vllm/model_executor/models/clip.py
+++ b/vllm/model_executor/models/clip.py
@@ -20,11 +20,10 @@
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.multimodal.utils import (cached_get_tokenizer,
                                    consecutive_placeholder_ranges,
-                                   repeat_and_pad_placeholder_tokens,
-                                   resolve_visual_encoder_outputs)
+                                   repeat_and_pad_placeholder_tokens)
 from vllm.sequence import SequenceData
 
-from .vision import VisionEncoderInfo
+from .vision import VisionEncoderInfo, resolve_visual_encoder_outputs
 
 
 def get_clip_patch_grid_length(*, image_size: int, patch_size: int) -> int:
diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py
index b74bb3c8a3f88..37b9989e489ec 100644
--- a/vllm/model_executor/models/pixtral.py
+++ b/vllm/model_executor/models/pixtral.py
@@ -31,14 +31,13 @@
 from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs
 from vllm.multimodal.inputs import NestedTensors, PlaceholderRange
 from vllm.multimodal.utils import (cached_get_tokenizer,
-                                   consecutive_placeholder_ranges,
-                                   resolve_visual_encoder_outputs)
+                                   consecutive_placeholder_ranges)
 from vllm.sequence import IntermediateTensors, SequenceData
 
 from .interfaces import SupportsMultiModal, SupportsPP
 from .utils import (init_vllm_registered_model, maybe_prefix,
                     merge_multimodal_embeddings)
-from .vision import VisionEncoderInfo
+from .vision import VisionEncoderInfo, resolve_visual_encoder_outputs
 
 try:
     from xformers import ops as xops
diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py
index 8537fec854b6d..76a810e8f0c20 100644
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -66,8 +66,9 @@
 from vllm.transformers_utils.config import uses_mrope
 
 from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsPP
-from .utils import (AutoWeightsLoader, WeightsMapper, get_vit_attn_backend,
+from .utils import (AutoWeightsLoader, WeightsMapper,
                     init_vllm_registered_model, maybe_prefix)
+from .vision import get_vit_attn_backend
 
 logger = init_logger(__name__)
 
diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py
index 7ea177e94afc0..cca42842bc06e 100644
--- a/vllm/model_executor/models/siglip.py
+++ b/vllm/model_executor/models/siglip.py
@@ -24,11 +24,10 @@
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.multimodal.utils import (cached_get_tokenizer,
                                    consecutive_placeholder_ranges,
-                                   repeat_and_pad_placeholder_tokens,
-                                   resolve_visual_encoder_outputs)
+                                   repeat_and_pad_placeholder_tokens)
 from vllm.sequence import SequenceData
 
-from .vision import VisionEncoderInfo
+from .vision import VisionEncoderInfo, resolve_visual_encoder_outputs
 
 
 def get_siglip_patch_grid_length(*, image_size: int, patch_size: int) -> int:
diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py
index 4ed3b237ae0e2..43b3c973c97b8 100644
--- a/vllm/model_executor/models/utils.py
+++ b/vllm/model_executor/models/utils.py
@@ -8,16 +8,12 @@
 from torch.func import functional_call
 from transformers import PretrainedConfig
 
-import vllm.envs as envs
-from vllm.attention.selector import (backend_name_to_enum,
-                                     get_global_forced_attn_backend)
 from vllm.config import VllmConfig
 from vllm.logger import init_logger
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.multimodal import MultiModalPlaceholderMap, NestedTensors
-from vllm.platforms import _Backend, current_platform
 from vllm.sequence import IntermediateTensors
-from vllm.utils import is_pin_memory_available, print_warning_once
+from vllm.utils import is_pin_memory_available
 
 logger = init_logger(__name__)
 
@@ -612,37 +608,6 @@ def make_empty_intermediate_tensors(
     return make_empty_intermediate_tensors
 
 
-def get_vit_attn_backend(support_fa: bool = False) -> _Backend:
-    """
-    Get the available attention backend for Vision Transformer.
-    """
-    # TODO(Isotr0py): Remove `support_fa` after support FA for all ViTs attn.
-    selected_backend: Optional[_Backend] = get_global_forced_attn_backend()
-    if selected_backend is None:
-        backend_by_env_var: Optional[str] = envs.VLLM_ATTENTION_BACKEND
-        if backend_by_env_var is not None:
-            selected_backend = backend_name_to_enum(backend_by_env_var)
-    if selected_backend is None:
-        # For Volta and Turing GPUs, use xformers instead.
-        device_available = current_platform.has_device_capability(80)
-        if device_available and support_fa:
-            from transformers.utils import is_flash_attn_2_available
-            if is_flash_attn_2_available():
-                selected_backend = _Backend.FLASH_ATTN
-            else:
-                print_warning_once(
-                    "Current `vllm-flash-attn` has a bug inside vision module, "
-                    "so we use xformers backend instead. You can run "
-                    "`pip install flash-attn` to use flash-attention backend.")
-                selected_backend = _Backend.XFORMERS
-        elif current_platform.is_cpu() or current_platform.is_rocm():
-            # ROCM doesn't support xformers
-            selected_backend = _Backend.TORCH_SDPA
-        else:
-            selected_backend = _Backend.XFORMERS
-    return selected_backend
-
-
 def maybe_prefix(prefix: str, name: str) -> str:
     """Add a prefix to a name if the prefix is non-empty.
 
diff --git a/vllm/model_executor/models/vision.py b/vllm/model_executor/models/vision.py
index 8516c9f7066f7..e6a9e153d9107 100644
--- a/vllm/model_executor/models/vision.py
+++ b/vllm/model_executor/models/vision.py
@@ -1,8 +1,15 @@
 from abc import ABC, abstractmethod
-from typing import Final, Generic, Protocol, TypeVar
+from typing import Final, Generic, Optional, Protocol, TypeVar, Union
 
+import torch
 from transformers import PretrainedConfig
 
+import vllm.envs as envs
+from vllm.attention.selector import (backend_name_to_enum,
+                                     get_global_forced_attn_backend)
+from vllm.platforms import _Backend, current_platform
+from vllm.utils import print_warning_once
+
 _C = TypeVar("_C", bound=PretrainedConfig)
 
 
@@ -60,3 +67,77 @@ def get_vision_encoder_info(
 
     msg = f"Unsupported vision config: {type(vision_config)}"
     raise NotImplementedError(msg)
+
+
+def get_vit_attn_backend(support_fa: bool = False) -> _Backend:
+    """
+    Get the available attention backend for Vision Transformer.
+    """
+    # TODO(Isotr0py): Remove `support_fa` after support FA for all ViTs attn.
+    selected_backend: Optional[_Backend] = get_global_forced_attn_backend()
+    if selected_backend is None:
+        backend_by_env_var: Optional[str] = envs.VLLM_ATTENTION_BACKEND
+        if backend_by_env_var is not None:
+            selected_backend = backend_name_to_enum(backend_by_env_var)
+    if selected_backend is None:
+        # For Volta and Turing GPUs, use xformers instead.
+        device_available = current_platform.has_device_capability(80)
+        if device_available and support_fa:
+            from transformers.utils import is_flash_attn_2_available
+            if is_flash_attn_2_available():
+                selected_backend = _Backend.FLASH_ATTN
+            else:
+                print_warning_once(
+                    "Current `vllm-flash-attn` has a bug inside vision module, "
+                    "so we use xformers backend instead. You can run "
+                    "`pip install flash-attn` to use flash-attention backend.")
+                selected_backend = _Backend.XFORMERS
+        elif current_platform.is_cpu() or current_platform.is_rocm():
+            # ROCM doesn't support xformers
+            selected_backend = _Backend.TORCH_SDPA
+        else:
+            selected_backend = _Backend.XFORMERS
+    return selected_backend
+
+
+def resolve_visual_encoder_outputs(
+    encoder_outputs: Union[torch.Tensor, list[torch.Tensor]],
+    feature_sample_layers: Optional[list[int]],
+    post_layer_norm: Optional[torch.nn.LayerNorm],
+    max_possible_layers: int,
+) -> torch.Tensor:
+    """Given the outputs of a visual encoder module that may correspond to the
+    output of the last layer, or a list of hidden states to be stacked,
+    handle post normalization and resolve it into a single output tensor.
+
+    Args:
+        encoder_outputs: Output of encoder's last layer or all hidden states.
+        feature_sample_layers: Optional layer indices to grab from the encoder
+            outputs; if provided, encoder outputs must be a list.
+        post_layer_norm: Post norm to apply to the output of the encoder.
+        max_possible_layers: Total layers in the fully loaded visual encoder.
+
+    """
+    if feature_sample_layers is None:
+        if post_layer_norm is not None:
+            return post_layer_norm(encoder_outputs)
+        return encoder_outputs
+
+    # Get the hidden states corresponding to the layer indices.
+    # Negative values are relative to the full visual encoder,
+    # so offset them depending on how many layers were loaded.
+    # NOTE: this assumes that encoder_outputs contains a list
+    # of hidden states in the same order as the encoder layers
+    # that produced them.
+    offset = max_possible_layers - len(encoder_outputs)
+    hs_pool = [
+        encoder_outputs[layer_idx]
+        if layer_idx >= 0 else encoder_outputs[layer_idx + offset]
+        for layer_idx in feature_sample_layers
+    ]
+
+    # Post-norm only the last selected hidden state (a tensor, not the list)
+    uses_last_layer = feature_sample_layers[-1] in (len(hs_pool) - 1, -1)
+    if post_layer_norm is not None and uses_last_layer:
+        hs_pool[-1] = post_layer_norm(hs_pool[-1])
+    return torch.cat(hs_pool, dim=-1)
diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py
index d542461874866..8680e4175593b 100644
--- a/vllm/multimodal/inputs.py
+++ b/vllm/multimodal/inputs.py
@@ -99,6 +99,8 @@ class MultiModalDataBuiltins(TypedDict, total=False):
 MultiModalDataDict: TypeAlias = Mapping[str, ModalityData[Any]]
 """
 A dictionary containing an entry for each modality type to input.
+
+The built-in modalities are defined by :class:`MultiModalDataBuiltins`.
 """
 
 
@@ -485,7 +487,7 @@ def get_items(self, modality: str) -> Sequence[MultiModalKwargsItem]:
 
 MultiModalPlaceholderDict = Mapping[str, Sequence[PlaceholderRange]]
 """
-A dictionary containing placeholder ranges.
+A dictionary containing placeholder ranges for each modality.
 """
 
 
diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py
index f4a514ba55d0c..1c6bbf77b926f 100644
--- a/vllm/multimodal/utils.py
+++ b/vllm/multimodal/utils.py
@@ -5,7 +5,6 @@
 
 import numpy as np
 import numpy.typing as npt
-import torch
 from PIL import Image
 
 import vllm.envs as envs
@@ -285,49 +284,6 @@ def encode_video_base64(frames: npt.NDArray) -> str:
     return video_io.encode_base64(frames)
 
 
-def resolve_visual_encoder_outputs(
-    encoder_outputs: Union[torch.Tensor, list[torch.Tensor]],
-    feature_sample_layers: Optional[list[int]],
-    post_layer_norm: Optional[torch.nn.LayerNorm],
-    max_possible_layers: int,
-) -> torch.Tensor:
-    """Given the outputs a visual encoder module that may correspond to the
-    output of the last layer, or a list of hidden states to be stacked,
-    handle post normalization and resolve it into a single output tensor.
-
-    Args:
-        encoder_outputs: Output of encoder's last layer or all hidden states.
-        feature_sample_layers: Optional layer indices to grab from the encoder
-            outputs; if provided, encoder outputs must be a list.
-        post_layer_norm: Post norm to apply to the output of the encoder.
-        max_possible_layers: Total layers in the fully loaded visual encoder.
-
-    """
-    if feature_sample_layers is None:
-        if post_layer_norm is not None:
-            return post_layer_norm(encoder_outputs)
-        return encoder_outputs
-
-    # Get the hidden states corresponding to the layer indices.
-    # Negative values are relative to the full visual encoder,
-    # so offset them depending on how many layers were loaded.
-    # NOTE: this assumes that encoder_outputs contains a list
-    # of hidden states in the same order as the encoder layers
-    # that produced them.
-    offset = max_possible_layers - len(encoder_outputs)
-    hs_pool = [
-        encoder_outputs[layer_idx]
-        if layer_idx >= 0 else encoder_outputs[layer_idx + offset]
-        for layer_idx in feature_sample_layers
-    ]
-
-    # Apply post-norm on the final hidden state if we are using it
-    uses_last_layer = feature_sample_layers[-1] in (len(hs_pool) - 1, -1)
-    if post_layer_norm is not None and uses_last_layer:
-        hs_pool[-1] = post_layer_norm(encoder_outputs)
-    return torch.cat(hs_pool, dim=-1)
-
-
 # Utilities for input processors
 _T = TypeVar("_T", str, int)
 

From 5984499e473c387c17904aa9933b8ed080621ca6 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Thu, 9 Jan 2025 01:14:14 +0800
Subject: [PATCH 044/115] [Doc] Expand Multimodal API Reference (#11852)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 docs/source/api/multimodal/index.md      | 61 ++++--------------------
 docs/source/api/multimodal/inputs.md     | 49 +++++++++++++++++++
 docs/source/api/multimodal/parse.md      |  9 ++++
 docs/source/api/multimodal/processing.md |  9 ++++
 docs/source/api/multimodal/profiling.md  |  9 ++++
 docs/source/api/multimodal/registry.md   |  9 ++++
 vllm/multimodal/parse.py                 | 31 ++++++++----
 vllm/multimodal/processing.py            | 26 +++++++---
 vllm/multimodal/profiling.py             |  7 ++-
 9 files changed, 139 insertions(+), 71 deletions(-)
 create mode 100644 docs/source/api/multimodal/inputs.md
 create mode 100644 docs/source/api/multimodal/parse.md
 create mode 100644 docs/source/api/multimodal/processing.md
 create mode 100644 docs/source/api/multimodal/profiling.md
 create mode 100644 docs/source/api/multimodal/registry.md

diff --git a/docs/source/api/multimodal/index.md b/docs/source/api/multimodal/index.md
index 0046b73ea825e..51e24795a34cf 100644
--- a/docs/source/api/multimodal/index.md
+++ b/docs/source/api/multimodal/index.md
@@ -2,10 +2,6 @@
 
 # Multi-Modality
 
-```{eval-rst}
-.. currentmodule:: vllm.multimodal
-```
-
 vLLM provides experimental support for multi-modal models through the {mod}`vllm.multimodal` package.
 
 Multi-modal inputs can be passed alongside text and token prompts to [supported models](#supported-mm-models)
@@ -13,61 +9,20 @@ via the `multi_modal_data` field in {class}`vllm.inputs.PromptType`.
 
 Looking to add your own multi-modal model? Please follow the instructions listed [here](#enabling-multimodal-inputs).
 
-
 ## Module Contents
 
-```{eval-rst}
-.. automodule:: vllm.multimodal
-```
-
-### Registry
-
 ```{eval-rst}
 .. autodata:: vllm.multimodal.MULTIMODAL_REGISTRY
 ```
 
-```{eval-rst}
-.. autoclass:: vllm.multimodal.MultiModalRegistry
-    :members:
-    :show-inheritance:
-```
-
-### Base Classes
-
-```{eval-rst}
-.. automodule:: vllm.multimodal.base
-    :members:
-    :show-inheritance:
-```
+## Submodules
 
-### Input Classes
+```{toctree}
+:maxdepth: 1
 
-```{eval-rst}
-.. automodule:: vllm.multimodal.inputs
-    :members:
-    :show-inheritance:
-```
-
-### Audio Classes
-
-```{eval-rst}
-.. automodule:: vllm.multimodal.audio
-    :members:
-    :show-inheritance:
-```
-
-### Image Classes
-
-```{eval-rst}
-.. automodule:: vllm.multimodal.image
-    :members:
-    :show-inheritance:
-```
-
-### Video Classes
-
-```{eval-rst}
-.. automodule:: vllm.multimodal.video
-    :members:
-    :show-inheritance:
+inputs
+parse
+processing
+profiling
+registry
 ```
diff --git a/docs/source/api/multimodal/inputs.md b/docs/source/api/multimodal/inputs.md
new file mode 100644
index 0000000000000..3d89666113229
--- /dev/null
+++ b/docs/source/api/multimodal/inputs.md
@@ -0,0 +1,49 @@
+# Input Definitions
+
+## User-facing inputs
+
+```{eval-rst}
+.. autodata:: vllm.multimodal.MultiModalDataDict
+```
+
+## Internal data structures
+
+```{eval-rst}
+.. autoclass:: vllm.multimodal.inputs.PlaceholderRange
+    :members:
+    :show-inheritance:
+```
+
+```{eval-rst}
+.. autodata:: vllm.multimodal.inputs.NestedTensors
+```
+
+```{eval-rst}
+.. autoclass:: vllm.multimodal.inputs.MultiModalFieldElem
+    :members:
+    :show-inheritance:
+```
+
+```{eval-rst}
+.. autoclass:: vllm.multimodal.inputs.MultiModalFieldConfig
+    :members:
+    :show-inheritance:
+```
+
+```{eval-rst}
+.. autoclass:: vllm.multimodal.inputs.MultiModalKwargsItem
+    :members:
+    :show-inheritance:
+```
+
+```{eval-rst}
+.. autoclass:: vllm.multimodal.inputs.MultiModalKwargs
+    :members:
+    :show-inheritance:
+```
+
+```{eval-rst}
+.. autoclass:: vllm.multimodal.inputs.MultiModalInputsV2
+    :members:
+    :show-inheritance:
+```
diff --git a/docs/source/api/multimodal/parse.md b/docs/source/api/multimodal/parse.md
new file mode 100644
index 0000000000000..4676139efe626
--- /dev/null
+++ b/docs/source/api/multimodal/parse.md
@@ -0,0 +1,9 @@
+# Data Parsing
+
+## Module Contents
+
+```{eval-rst}
+.. automodule:: vllm.multimodal.parse
+    :members:
+    :member-order: bysource
+```
diff --git a/docs/source/api/multimodal/processing.md b/docs/source/api/multimodal/processing.md
new file mode 100644
index 0000000000000..0d81c8d3966ee
--- /dev/null
+++ b/docs/source/api/multimodal/processing.md
@@ -0,0 +1,9 @@
+# Data Processing
+
+## Module Contents
+
+```{eval-rst}
+.. automodule:: vllm.multimodal.processing
+    :members:
+    :member-order: bysource
+```
diff --git a/docs/source/api/multimodal/profiling.md b/docs/source/api/multimodal/profiling.md
new file mode 100644
index 0000000000000..b455145212202
--- /dev/null
+++ b/docs/source/api/multimodal/profiling.md
@@ -0,0 +1,9 @@
+# Memory Profiling
+
+## Module Contents
+
+```{eval-rst}
+.. automodule:: vllm.multimodal.profiling
+    :members:
+    :member-order: bysource
+```
diff --git a/docs/source/api/multimodal/registry.md b/docs/source/api/multimodal/registry.md
new file mode 100644
index 0000000000000..0737a4385cf32
--- /dev/null
+++ b/docs/source/api/multimodal/registry.md
@@ -0,0 +1,9 @@
+# Registry
+
+## Module Contents
+
+```{eval-rst}
+.. automodule:: vllm.multimodal.registry
+    :members:
+    :member-order: bysource
+```
diff --git a/vllm/multimodal/parse.py b/vllm/multimodal/parse.py
index 6be046ba77ca7..ccff0e857eec4 100644
--- a/vllm/multimodal/parse.py
+++ b/vllm/multimodal/parse.py
@@ -13,14 +13,16 @@
 
 from .audio import resample_audio
 from .inputs import (AudioItem, HfAudioItem, HfImageItem, HfVideoItem,
-                     ImageItem, ModalityData, MultiModalDataDict,
-                     NestedTensors, VideoItem)
+                     ImageItem, ModalityData, MultiModalDataDict, VideoItem)
 
 _T = TypeVar("_T")
 _I = TypeVar("_I")
 
 
 class ModalityDataItems(ABC, Generic[_T, _I]):
+    """
+    Represents data items for a modality in :class:`MultiModalDataItems`.
+    """
 
     def __init__(self, data: _T, modality: str) -> None:
         super().__init__()
@@ -69,6 +71,7 @@ def get_passthrough_data(self) -> Mapping[str, object]:
 
 
 class ProcessorBatchItems(ModalityDataItems[Sequence[_T], _T]):
+    """Base class for data items that are arranged in a list."""
 
     def get_count(self) -> int:
         return len(self.data)
@@ -83,7 +86,12 @@ def get_passthrough_data(self) -> Mapping[str, object]:
         return {}
 
 
-class EmbeddingItems(ModalityDataItems[NestedTensors, torch.Tensor]):
+class EmbeddingItems(ModalityDataItems[Union[torch.Tensor, list[torch.Tensor]],
+                                       torch.Tensor]):
+    """
+    Base class for data items that are expressed as a batched embedding tensor,
+    or a list of embedding tensors (one per item).
+    """
 
     def get_count(self) -> int:
         return len(self.data)
@@ -109,7 +117,7 @@ def __init__(self, data: Sequence[HfAudioItem]) -> None:
 
 class AudioEmbeddingItems(EmbeddingItems):
 
-    def __init__(self, data: NestedTensors) -> None:
+    def __init__(self, data: Union[torch.Tensor, list[torch.Tensor]]) -> None:
         super().__init__(data, "audio")
 
 
@@ -137,7 +145,7 @@ def get_image_size(self, item_idx: int) -> ImageSize:
 
 class ImageEmbeddingItems(EmbeddingItems):
 
-    def __init__(self, data: NestedTensors) -> None:
+    def __init__(self, data: Union[torch.Tensor, list[torch.Tensor]]) -> None:
         super().__init__(data, "image")
 
 
@@ -163,7 +171,7 @@ def get_frame_size(self, item_idx: int) -> ImageSize:
 
 class VideoEmbeddingItems(EmbeddingItems):
 
-    def __init__(self, data: NestedTensors) -> None:
+    def __init__(self, data: Union[torch.Tensor, list[torch.Tensor]]) -> None:
         super().__init__(data, "video")
 
 
@@ -172,8 +180,8 @@ def __init__(self, data: NestedTensors) -> None:
 
 class MultiModalDataItems(UserDict[str, ModalityDataItems[Any, Any]]):
     """
-    As :class:`MultiModalDataDict`, but normalized such that each entry
-    corresponds to a list.
+    As :data:`~vllm.multimodal.inputs.MultiModalDataDict`, but normalized
+    such that each entry corresponds to a list.
     """
 
     def get_count(self, modality: str, *, strict: bool = True) -> int:
@@ -226,7 +234,8 @@ def get_items(
 
 class MultiModalDataParser:
     """
-    Parses :class:`MultiModalDataDict` into :class:`MultiModalDataItems`.
+    Parses :data:`~vllm.multimodal.inputs.MultiModalDataDict` into
+    :class:`MultiModalDataItems`.
 
     Args:
         target_sr (float, optional): Enables automatic resampling of audio
@@ -238,7 +247,9 @@ def __init__(self, *, target_sr: Optional[float] = None) -> None:
 
         self.target_sr = target_sr
 
-    def _is_embeddings(self, data: object) -> TypeGuard[NestedTensors]:
+    def _is_embeddings(
+            self, data: object
+    ) -> TypeGuard[Union[torch.Tensor, list[torch.Tensor]]]:
         if isinstance(data, torch.Tensor):
             return data.ndim == 3
         if is_list_of(data, torch.Tensor):
diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py
index c6a30cacebdd1..07d883d5d7295 100644
--- a/vllm/multimodal/processing.py
+++ b/vllm/multimodal/processing.py
@@ -33,20 +33,24 @@
 
 @dataclass
 class PromptReplacement:
+    """
+    Defines how to replace portions of an input prompt with placeholder tokens.
+    """
+
     modality: str
     """The modality for which the replacement is made."""
 
     target: _PromptSeq
-    """The text or token sequence to find and replace."""
+    """The token sequence (or text) to find and replace."""
 
     replacement: Union[Callable[[int], _PromptSeq],
                        _PromptSeq] = field(repr=False)
     """
-    Given the index of the processed item within :attr:`modality`, output the
-    replacement text or token sequence.
+    Given the index of the processed item within :attr:`modality`,
+    output the replacement token sequence (or text).
 
-    For convenience, you can pass in the replacement instead of a function
-    if it does not depend on the input.
+    For convenience, you can directly pass in the replacement token sequence
+    (or text) instead of a function if it does not depend on the input.
     """
 
     def bind(self, tokenizer: AnyTokenizer) -> "BoundPromptReplacement":
@@ -132,6 +136,11 @@ def token_ids(self) -> list[int]:
 
 @dataclass
 class BoundPromptReplacement:
+    """
+    A :class:`PromptReplacement` bound to a tokenizer to automatically
+    convert :attr:`target` and the result of :meth:`get_replacement` between
+    token sequence and text representations.
+    """
     tokenizer: AnyTokenizer = field(repr=False)
     modality: str
 
@@ -144,6 +153,7 @@ def __post_init__(self) -> None:
 
     @property
     def target(self) -> _BoundPromptSequence:
+        """The token sequence (or text) to find and replace."""
         target = self._target
 
         return _BoundPromptSequence(
@@ -153,6 +163,10 @@ def target(self) -> _BoundPromptSequence:
         )
 
     def get_replacement(self, item_idx: int) -> _BoundPromptSequence:
+        """
+        Given the index of the processed item within :attr:`modality`,
+        output the replacement token sequence (or text).
+        """
         replacement = self._replacement
         if callable(replacement):
             cache_key = item_idx
@@ -528,7 +542,7 @@ def put(
 
 
 class BaseProcessingInfo:
-    """Base class containing information to perform processing."""
+    """Base class to provide the information necessary for data processing."""
 
     def __init__(self, ctx: InputProcessingContext) -> None:
         super().__init__()
diff --git a/vllm/multimodal/profiling.py b/vllm/multimodal/profiling.py
index 2ac3a6bcf3ddd..6f7da1509990f 100644
--- a/vllm/multimodal/profiling.py
+++ b/vllm/multimodal/profiling.py
@@ -19,7 +19,10 @@
 
 @dataclass
 class ProcessorInputs:
-    """Keyword arguments to :meth:`BaseMultiModalProcessor`."""
+    """
+    Represents the keyword arguments to
+    :meth:`vllm.multimodal.processing.BaseMultiModalProcessor.apply`.
+    """
     prompt_text: str
     mm_data: MultiModalDataDict
     hf_processor_mm_kwargs: Mapping[str, object] = field(default_factory=dict)
@@ -47,7 +50,7 @@ def get_dummy_processor_inputs(
     ) -> ProcessorInputs:
         """
         Build the input which, after processing, results in
-        `self.info.get_mm_max_tokens_per_item()` placeholder tokens.
+        :code:`self.info.get_mm_max_tokens_per_item()` placeholder tokens.
         """
         raise NotImplementedError
 

From 47de8821d3cdd32fce7df6312318223aee591fd2 Mon Sep 17 00:00:00 2001
From: WangErXiao <863579016@qq.com>
Date: Thu, 9 Jan 2025 02:21:30 +0800
Subject: [PATCH 045/115] [Misc]add some explanations for BlockHashType
 (#11847)

---
 vllm/v1/core/kv_cache_utils.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py
index 84ff48bf428a0..22a5d2fb08a48 100644
--- a/vllm/v1/core/kv_cache_utils.py
+++ b/vllm/v1/core/kv_cache_utils.py
@@ -11,8 +11,10 @@
 
 class BlockHashType(NamedTuple):
     """Hash value of a block (int), the token IDs in the block, and extra keys.
-    The reason we keep a tuple of token IDs and extra keys is to make sure
-    no hash collision happens when the hash value is the same.
+    We keep a tuple of token IDs and extra keys to reduce the likelihood of
+    hash collisions when the hash value is the same. Note, however, that
+    hash collisions can still theoretically occur, albeit with an extremely
+    low probability.
     """
     # Hash value of the block in an integer.
     hash_value: int

From 56fe4c297c7d9d872eccc19e3edbf1d75e1a30e2 Mon Sep 17 00:00:00 2001
From: Robert Shaw
 <114415538+robertgshaw2-neuralmagic@users.noreply.github.com>
Date: Wed, 8 Jan 2025 14:33:29 -0500
Subject: [PATCH 046/115] [TPU][Quantization] TPU `W8A8` (#11785)

Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
---
 .buildkite/run-tpu-test.sh                    |  11 +-
 tests/tpu/test_quantization_accuracy.py       |  49 +++++++
 .../schemes/compressed_tensors_w8a8_int8.py   | 105 ++++----------
 .../schemes/compressed_tensors_wNa16.py       |   2 +-
 .../layers/quantization/gptq_marlin.py        |   2 +-
 .../layers/quantization/kernels/__init__.py   |  74 ----------
 .../{ => mixed_precision}/MPLinearKernel.py   |   0
 .../kernels/mixed_precision/__init__.py       |  74 ++++++++++
 .../kernels/{ => mixed_precision}/exllama.py  |   0
 .../kernels/{ => mixed_precision}/machete.py  |   0
 .../kernels/{ => mixed_precision}/marlin.py   |   0
 .../kernels/scaled_mm/ScaledMMLinearKernel.py |  64 +++++++++
 .../kernels/scaled_mm/__init__.py             |  84 +++++++++++
 .../quantization/kernels/scaled_mm/cutlass.py | 134 ++++++++++++++++++
 .../quantization/kernels/scaled_mm/xla.py     | 101 +++++++++++++
 .../layers/quantization/utils/w8a8_utils.py   |  38 -----
 vllm/model_executor/parameter.py              |  13 ++
 vllm/platforms/tpu.py                         |   4 +-
 18 files changed, 565 insertions(+), 190 deletions(-)
 create mode 100644 tests/tpu/test_quantization_accuracy.py
 rename vllm/model_executor/layers/quantization/kernels/{ => mixed_precision}/MPLinearKernel.py (100%)
 create mode 100644 vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py
 rename vllm/model_executor/layers/quantization/kernels/{ => mixed_precision}/exllama.py (100%)
 rename vllm/model_executor/layers/quantization/kernels/{ => mixed_precision}/machete.py (100%)
 rename vllm/model_executor/layers/quantization/kernels/{ => mixed_precision}/marlin.py (100%)
 create mode 100644 vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py
 create mode 100644 vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py
 create mode 100644 vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py
 create mode 100644 vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py

diff --git a/.buildkite/run-tpu-test.sh b/.buildkite/run-tpu-test.sh
index 13605a3e97142..a8f021890f742 100644
--- a/.buildkite/run-tpu-test.sh
+++ b/.buildkite/run-tpu-test.sh
@@ -14,4 +14,13 @@ remove_docker_container
 # For HF_TOKEN.
 source /etc/environment
 # Run a simple end-to-end example.
-docker run --privileged --net host --shm-size=16G -it -e "HF_TOKEN=$HF_TOKEN" --name tpu-test vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git && python3 -m pip install pytest && python3 -m pip install lm_eval[api]==0.4.4 && pytest -v -s /workspace/vllm/tests/entrypoints/openai/test_accuracy.py && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py && python3 /workspace/vllm/tests/tpu/test_compilation.py && python3 /workspace/vllm/examples/offline_inference/offline_inference_tpu.py"
+docker run --privileged --net host --shm-size=16G -it \
+    -e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
+    vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \
+    && python3 -m pip install pytest \
+    && python3 -m pip install lm_eval[api]==0.4.4 \
+    && pytest -v -s /workspace/vllm/tests/entrypoints/openai/test_accuracy.py \
+    && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
+    && python3 /workspace/vllm/tests/tpu/test_compilation.py \
+    && pytest -v -s /workspace/vllm/tests/tpu/test_quantization_accuracy.py \
+    && python3 /workspace/vllm/examples/offline_inference/offline_inference_tpu.py"
diff --git a/tests/tpu/test_quantization_accuracy.py b/tests/tpu/test_quantization_accuracy.py
new file mode 100644
index 0000000000000..6cd5615c44e1e
--- /dev/null
+++ b/tests/tpu/test_quantization_accuracy.py
@@ -0,0 +1,49 @@
+from dataclasses import dataclass
+
+import lm_eval
+import pytest
+
+TASK = "gsm8k"
+FILTER = "exact_match,strict-match"
+RTOL = 0.03
+
+
+@dataclass
+class GSM8KAccuracyTestConfig:
+    model_name: str
+    excepted_value: float
+
+    def get_model_args(self) -> str:
+        return (f"pretrained={self.model_name},"
+                "max_model_len=4096,max_num_seqs=32")
+
+
+# NOTE: Accuracy scores measured on GPUs.
+ACCURACY_CONFIGS = [
+    GSM8KAccuracyTestConfig(
+        model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+        excepted_value=0.76),  # no bias
+    # NOTE(rob): We cannot re-initialize VLLM in the same process for TPU,
+    # so only one of these tests can run in a single call to pytest. As
+    # a follow up, move this into the LM-EVAL section of the CI.
+    # GSM8KAccuracyTestConfig(
+    #     model_name="neuralmagic/Qwen2-7B-Instruct-quantized.w8a8",
+    #     excepted_value=0.66),  # bias in QKV layers
+]
+
+
+@pytest.mark.parametrize("config", ACCURACY_CONFIGS)
+def test_gsm8k_correctness(config: GSM8KAccuracyTestConfig):
+    """Check the measured GSM8K score is within RTOL of the expected one."""
+    results = lm_eval.simple_evaluate(
+        model="vllm",
+        model_args=config.get_model_args(),
+        # Use TASK so the evaluated task always matches the results lookup.
+        tasks=TASK,
+        batch_size="auto",
+    )
+
+    expected_value = config.excepted_value
+    measured_value = results["results"][TASK][FILTER]
+    assert (measured_value - RTOL < expected_value < measured_value + RTOL
+            ), f"Expected: {expected_value} |  Measured: {measured_value}"
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
index 6cbc58d61e970..0e3f4731775c5 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
@@ -1,14 +1,13 @@
-from typing import Callable, List, Optional
+from typing import Callable, List, Optional, Set
 
 import torch
 from compressed_tensors.quantization import QuantizationStrategy
-from torch.nn import Parameter
 
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
     CompressedTensorsScheme)
-from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
-    apply_int8_linear, convert_to_channelwise)
+from vllm.model_executor.layers.quantization.kernels.scaled_mm import (
+    ScaledMMLinearLayerConfig, choose_scaled_mm_linear_kernel)
 from vllm.model_executor.parameter import (BasevLLMParameter,
                                            ChannelQuantScaleParameter,
                                            ModelWeightParameter,
@@ -18,6 +17,7 @@
 
 
 class CompressedTensorsW8A8Int8(CompressedTensorsScheme):
+    _kernel_backends_being_used: Set[str] = set()
 
     def __init__(self, strategy: str, is_static_input_scheme: bool,
                  input_symmetric: bool):
@@ -30,74 +30,25 @@ def get_min_capability(cls) -> int:
         # turing and up
         return 75
 
-    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
-        # WEIGHT
-        # Cutlass kernels need transposed weight.
-        weight = layer.weight
-        layer.weight = Parameter(weight.t(), requires_grad=False)
-
-        # WEIGHT SCALE
-        # Cutlass kernels support only per-tensor and per-channel.
-        # If we have a fused module (QKV, MLP) with per tensor scales (thus N
-        # scales being passed to the kernel), convert to the per-channel case.
-        is_fused_module = len(self.logical_widths) > 1
-        if is_fused_module and self.strategy == QuantizationStrategy.TENSOR:
-            ws_channelwise = convert_to_channelwise(layer.weight_scale,
-                                                    self.logical_widths)
-            layer.weight_scale = Parameter(ws_channelwise, requires_grad=False)
-        else:
-            layer.weight_scale = Parameter(layer.weight_scale.data,
-                                           requires_grad=False)
-        # INPUT SCALE
-        if self.is_static_input_scheme:
-            if self.input_symmetric:
-                layer.input_scale = Parameter(layer.input_scale.max(),
-                                              requires_grad=False)
-                layer.input_zero_point = None
-            else:
-                # reconstruct the ranges
-                int8_traits = torch.iinfo(torch.int8)
-                azps = layer.input_zero_point.to(dtype=torch.int32)
-                range_max = (layer.input_scale *
-                             (int8_traits.max - azps)).max()
-                range_min = (layer.input_scale *
-                             (int8_traits.min - azps)).min()
-
-                scale = (range_max - range_min) / (int8_traits.max -
-                                                   int8_traits.min)
-                layer.input_scale = Parameter(scale, requires_grad=False)
-
-                # AZP loaded as int8 but used as int32
-                azp = (int8_traits.min -
-                       range_min / scale).to(dtype=torch.int32)
-                layer.input_zero_point = Parameter(azp, requires_grad=False)
-
-        else:
-            layer.input_scale = None
-            layer.input_zero_point = None
-
-        # azp_adj is the AZP adjustment term, used to account for weights.
-        # It does not depend on scales or azp, so it is the same for
-        # static and dynamic quantization.
-        # For more details, see csrc/quantization/cutlass_w8a8/Epilogues.md
-        # https://github.com/vllm-project/vllm/blob/8d59dbb00044a588cab96bcdc028006ed922eb06/csrc/quantization/cutlass_w8a8/Epilogues.md
-        if not self.input_symmetric:
-            azp_adj = layer.weight.sum(dim=0, keepdim=True, dtype=torch.int32)
-            if self.is_static_input_scheme:
-                # cutlass_w8a8 requires azp to be folded into azp_adj
-                #  in the per-tensor case
-                azp_adj = layer.input_zero_point * azp_adj
-
-            layer.azp_adj = azp_adj
-        else:
-            layer.azp_adj = None
-
     def create_weights(self, layer: torch.nn.Module,
                        output_partition_sizes: List[int],
                        input_size_per_partition: int,
                        params_dtype: torch.dtype, weight_loader: Callable,
                        **kwargs):
-        self.logical_widths = output_partition_sizes
+        layer.logical_widths = output_partition_sizes
+
+        scaled_mm_linear_kernel_config = ScaledMMLinearLayerConfig(
+            is_channelwise=(self.strategy == QuantizationStrategy.CHANNEL),
+            is_static_input_scheme=self.is_static_input_scheme,
+            input_symmetric=self.input_symmetric)
+
+        kernel_type = choose_scaled_mm_linear_kernel(
+            scaled_mm_linear_kernel_config)
+
+        if kernel_type.__name__ not in self._kernel_backends_being_used:
+            logger.info("Using %s for CompressedTensorsW8A8Int8",
+                        kernel_type.__name__)
+            self._kernel_backends_being_used.add(kernel_type.__name__)
 
         # WEIGHT
         weight = ModelWeightParameter(data=torch.empty(
@@ -140,12 +91,18 @@ def create_weights(self, layer: torch.nn.Module,
                     weight_loader=weight_loader)
                 layer.register_parameter("input_zero_point", input_zero_point)
 
+        self.kernel = kernel_type(c=scaled_mm_linear_kernel_config,
+                                  w_q_param_name="weight",
+                                  w_s_param_name="weight_scale",
+                                  i_s_param_name="input_scale",
+                                  i_zp_param_name="input_zero_point",
+                                  azp_adj_param_name="azp_adj")
+
+    # Checkpoints are serialized in compressed-tensors format, which is
+    # different from the format the kernel may want. Handle repacking here.
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        self.kernel.process_weights_after_loading(layer)
+
     def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor,
                       bias: Optional[torch.Tensor]) -> torch.Tensor:
-        return apply_int8_linear(input=x,
-                                 weight=layer.weight,
-                                 weight_scale=layer.weight_scale,
-                                 input_scale=layer.input_scale,
-                                 input_zero_point=layer.input_zero_point,
-                                 azp_adj=layer.azp_adj,
-                                 bias=bias)
+        return self.kernel.apply_weights(layer, x, bias)
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py
index a515738017781..2dd243b9c3109 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py
@@ -6,7 +6,7 @@
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
     CompressedTensorsScheme)
-from vllm.model_executor.layers.quantization.kernels import (
+from vllm.model_executor.layers.quantization.kernels.mixed_precision import (
     MPLinearLayerConfig, choose_mp_linear_kernel)
 from vllm.model_executor.layers.quantization.utils.marlin_utils import (
     marlin_repeat_scales_on_all_ranks)
diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py
index a006d729cc627..2dbfca9b07690 100644
--- a/vllm/model_executor/layers/quantization/gptq_marlin.py
+++ b/vllm/model_executor/layers/quantization/gptq_marlin.py
@@ -11,7 +11,7 @@
                                                set_weight_attrs)
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig)
-from vllm.model_executor.layers.quantization.kernels import (
+from vllm.model_executor.layers.quantization.kernels.mixed_precision import (
     MPLinearLayerConfig, choose_mp_linear_kernel)
 from vllm.model_executor.layers.quantization.utils import replace_parameter
 from vllm.model_executor.layers.quantization.utils.marlin_utils import (
diff --git a/vllm/model_executor/layers/quantization/kernels/__init__.py b/vllm/model_executor/layers/quantization/kernels/__init__.py
index 94a3dc2584d6b..e69de29bb2d1d 100644
--- a/vllm/model_executor/layers/quantization/kernels/__init__.py
+++ b/vllm/model_executor/layers/quantization/kernels/__init__.py
@@ -1,74 +0,0 @@
-from typing import List, Optional, Type
-
-import vllm.envs as envs
-from vllm.model_executor.layers.quantization.kernels.exllama import (
-    ExllamaLinearKernel)
-from vllm.model_executor.layers.quantization.kernels.machete import (
-    MacheteLinearKernel)
-from vllm.model_executor.layers.quantization.kernels.marlin import (
-    MarlinLinearKernel)
-from vllm.model_executor.layers.quantization.kernels.MPLinearKernel import (
-    MPLinearKernel, MPLinearLayerConfig)
-from vllm.platforms import current_platform
-
-# in priority/performance order (when available)
-_POSSIBLE_KERNELS: List[Type[MPLinearKernel]] = [
-    MacheteLinearKernel,
-    MarlinLinearKernel,
-    ExllamaLinearKernel,
-]
-
-
-def choose_mp_linear_kernel(
-        config: MPLinearLayerConfig,
-        compute_capability: Optional[int] = None) -> Type[MPLinearKernel]:
-    """
-    Choose an MPLinearKernel that can implement the given config for the given
-     compute capability. Attempts to choose the best kernel in terms of 
-     performance.
-
-    Args:
-        config (MPLinearLayerConfig): Description of the linear layer to be 
-          implemented.
-        compute_capability (Optional[int], optional): The compute capability of
-          the target device, if None uses `current_platform` to get the compute 
-          capability. Defaults to None.
-
-    Raises:
-        ValueError: If no kernel can implement the given config.
-
-    Returns:
-        Type[MPLinearKernel]: Chosen kernel.
-    """
-    if compute_capability is None:
-        if current_platform is None:
-            raise ValueError("Cannot determine compute capability")
-        _cc = current_platform.get_device_capability()
-        compute_capability = _cc[0] * 10 + _cc[1]
-
-    failure_reasons = []
-    for kernel in _POSSIBLE_KERNELS:
-        if kernel.__name__ in envs.VLLM_DISABLED_KERNELS:
-            failure_reasons.append(
-                f' {kernel.__name__} disabled by environment variable')
-            continue
-
-        if kernel.get_min_capability() > compute_capability:
-            failure_reasons.append(
-                f"{kernel.__name__} requires capability "
-                f"{kernel.get_min_capability()}, current compute capability "
-                f"is {compute_capability}")
-            continue
-
-        can_implement, failure_reason = kernel.can_implement(config)
-        if can_implement:
-            return kernel
-        else:
-            failure_reasons.append(
-                f' {kernel.__name__} cannot implement due to: {failure_reason}'
-            )
-
-    raise ValueError(
-        "Failed to find a kernel that can implement the "\
-        "WNA16 linear layer. Reasons: \n"
-        + '\n'.join(failure_reasons))
diff --git a/vllm/model_executor/layers/quantization/kernels/MPLinearKernel.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py
similarity index 100%
rename from vllm/model_executor/layers/quantization/kernels/MPLinearKernel.py
rename to vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py
diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py
new file mode 100644
index 0000000000000..83549870e3f0b
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py
@@ -0,0 +1,74 @@
+from typing import List, Optional, Type
+
+import vllm.envs as envs
+from vllm.model_executor.layers.quantization.kernels.mixed_precision.exllama import (  # noqa: E501
+    ExllamaLinearKernel)
+from vllm.model_executor.layers.quantization.kernels.mixed_precision.machete import (  # noqa: E501
+    MacheteLinearKernel)
+from vllm.model_executor.layers.quantization.kernels.mixed_precision.marlin import (  # noqa: E501
+    MarlinLinearKernel)
+from vllm.model_executor.layers.quantization.kernels.mixed_precision.MPLinearKernel import (  # noqa: E501
+    MPLinearKernel, MPLinearLayerConfig)
+from vllm.platforms import current_platform
+
+# in priority/performance order (when available)
+_POSSIBLE_KERNELS: List[Type[MPLinearKernel]] = [
+    MacheteLinearKernel,
+    MarlinLinearKernel,
+    ExllamaLinearKernel,
+]
+
+
+def choose_mp_linear_kernel(
+        config: MPLinearLayerConfig,
+        compute_capability: Optional[int] = None) -> Type[MPLinearKernel]:
+    """
+    Choose an MPLinearKernel that can implement the given config for the given
+     compute capability. Attempts to choose the best kernel in terms of 
+     performance.
+
+    Args:
+        config (MPLinearLayerConfig): Description of the linear layer to be 
+          implemented.
+        compute_capability (Optional[int], optional): The compute capability of
+          the target device, if None uses `current_platform` to get the compute 
+          capability. Defaults to None.
+
+    Raises:
+        ValueError: If no kernel can implement the given config.
+
+    Returns:
+        Type[MPLinearKernel]: Chosen kernel.
+    """
+    if compute_capability is None:
+        if current_platform is None:
+            raise ValueError("Cannot determine compute capability")
+        _cc = current_platform.get_device_capability()
+        compute_capability = _cc[0] * 10 + _cc[1]
+
+    failure_reasons = []
+    for kernel in _POSSIBLE_KERNELS:
+        if kernel.__name__ in envs.VLLM_DISABLED_KERNELS:
+            failure_reasons.append(
+                f' {kernel.__name__} disabled by environment variable')
+            continue
+
+        if kernel.get_min_capability() > compute_capability:
+            failure_reasons.append(
+                f"{kernel.__name__} requires capability "
+                f"{kernel.get_min_capability()}, current compute capability "
+                f"is {compute_capability}")
+            continue
+
+        can_implement, failure_reason = kernel.can_implement(config)
+        if can_implement:
+            return kernel
+        else:
+            failure_reasons.append(
+                f' {kernel.__name__} cannot implement due to: {failure_reason}'
+            )
+
+    raise ValueError(
+        "Failed to find a kernel that can implement the "\
+        "WNA16 linear layer. Reasons: \n"
+        + '\n'.join(failure_reasons))
diff --git a/vllm/model_executor/layers/quantization/kernels/exllama.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py
similarity index 100%
rename from vllm/model_executor/layers/quantization/kernels/exllama.py
rename to vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py
diff --git a/vllm/model_executor/layers/quantization/kernels/machete.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py
similarity index 100%
rename from vllm/model_executor/layers/quantization/kernels/machete.py
rename to vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py
diff --git a/vllm/model_executor/layers/quantization/kernels/marlin.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py
similarity index 100%
rename from vllm/model_executor/layers/quantization/kernels/marlin.py
rename to vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py
diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py
new file mode 100644
index 0000000000000..75cf91f191136
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py
@@ -0,0 +1,64 @@
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from typing import Optional, Tuple
+
+import torch
+
+
+@dataclass
+class ScaledMMLinearLayerConfig:
+    is_channelwise: bool
+    is_static_input_scheme: bool
+    input_symmetric: bool
+
+
+class ScaledMMLinearKernel(ABC):
+
+    @classmethod
+    @abstractmethod
+    def get_min_capability(cls) -> int:
+        raise NotImplementedError
+
+    @classmethod
+    @abstractmethod
+    def can_implement(
+            cls, c: ScaledMMLinearLayerConfig) -> Tuple[bool, Optional[str]]:
+        raise NotImplementedError
+
+    def __init__(self, c: ScaledMMLinearLayerConfig, w_q_param_name: str,
+                 w_s_param_name: str, i_s_param_name: str,
+                 i_zp_param_name: str, azp_adj_param_name: str) -> None:
+        assert self.can_implement(c)[0]  # (ok, reason) tuple is always truthy
+        self.config = c
+        self.w_q_name = w_q_param_name
+        self.w_s_name = w_s_param_name
+        self.i_s_name = i_s_param_name
+        self.i_zp_name = i_zp_param_name
+        self.azp_adj_name = azp_adj_param_name
+
+    @abstractmethod
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        raise NotImplementedError
+
+    @abstractmethod
+    def apply_weights(self,
+                      layer: torch.nn.Module,
+                      x: torch.Tensor,
+                      bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+        raise NotImplementedError
+
+    def _get_weight_params(
+            self, layer: torch.nn.Module
+    ) -> Tuple[torch.Tensor,  # weight
+               torch.Tensor,  # weight_scale
+               Optional[torch.Tensor],  # input_scale, 
+               Optional[torch.Tensor],  # input_zp
+               Optional[torch.Tensor],  # azp_adj
+               ]:
+        return (
+            getattr(layer, self.w_q_name),
+            getattr(layer, self.w_s_name),
+            getattr(layer, self.i_s_name),
+            getattr(layer, self.i_zp_name),
+            getattr(layer, self.azp_adj_name),
+        )
diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py
new file mode 100644
index 0000000000000..586752d3d34e3
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py
@@ -0,0 +1,84 @@
+import os
+from typing import Dict, List, Optional, Type
+
+from vllm.model_executor.layers.quantization.kernels.scaled_mm.cutlass import (
+    CutlassScaledMMLinearKernel)
+from vllm.model_executor.layers.quantization.kernels.scaled_mm.ScaledMMLinearKernel import (  # noqa: E501
+    ScaledMMLinearKernel, ScaledMMLinearLayerConfig)
+# from vllm.model_executor.layers.quantization.kernels.scaled_mm.triton import (
+#     TritonScaledMMLinear)
+from vllm.model_executor.layers.quantization.kernels.scaled_mm.xla import (
+    XLAScaledMMLinearKernel)
+from vllm.platforms import PlatformEnum, current_platform
+
+# in priority/performance order (when available)
+_POSSIBLE_KERNELS: Dict[PlatformEnum, List[Type[ScaledMMLinearKernel]]] = {
+    PlatformEnum.CPU: [CutlassScaledMMLinearKernel],
+    PlatformEnum.CUDA: [CutlassScaledMMLinearKernel],
+    # TODO(rob): Create TritonScaledMMLinear kernel. ROCM will
+    # incorrectly attempt to run AZP models if prompted to.
+    PlatformEnum.ROCM: [CutlassScaledMMLinearKernel],
+    PlatformEnum.TPU: [XLAScaledMMLinearKernel],
+}
+
+
+def choose_scaled_mm_linear_kernel(
+        config: ScaledMMLinearLayerConfig,
+        compute_capability: Optional[int] = None
+) -> Type[ScaledMMLinearKernel]:
+    """
+    Choose a ScaledMMLinearKernel that can implement the given config for the
+    given compute capability. Attempts to choose the best kernel in terms of
+    performance.
+
+    Args:
+        config (ScaledMMLinearLayerConfig): Description of the linear layer
+            to be implemented.
+        compute_capability (Optional[int], optional): The compute capability of
+            the target device, if None uses `current_platform` to get the
+            compute capability. Defaults to None.
+
+    Raises:
+        ValueError: If no kernel can implement the given config.
+
+    Returns:
+        Type[ScaledMMLinearKernel]: Chosen kernel.
+    """
+
+    if compute_capability is None:
+        _cc = current_platform.get_device_capability()
+        if _cc is not None:
+            compute_capability = _cc[0] * 10 + _cc[1]
+
+    failure_reasons = []
+    for kernel in _POSSIBLE_KERNELS.get(current_platform._enum, []):
+        if kernel.__name__ in os.environ.get("VLLM_DISABLED_KERNELS", "")\
+            .split(","):
+            failure_reasons.append(
+                f' {kernel.__name__} disabled by environment variable')
+            continue
+
+        # If the current platform uses compute_capability,
+        # make sure the kernel supports the compute capability.
+        if compute_capability is not None:
+            kernel_min_capability = kernel.get_min_capability()
+            if (kernel_min_capability is not None
+                    and kernel_min_capability > compute_capability):
+                failure_reasons.append(
+                    f"{kernel.__name__} requires capability "
+                    f"{kernel_min_capability}, current compute capability "
+                    f"is {compute_capability}")
+                continue
+
+        can_implement, failure_reason = kernel.can_implement(config)
+        if can_implement:
+            return kernel
+        failure_reasons.append(
+            f' {kernel.__name__} cannot implement due to: {failure_reason}')
+
+    if not failure_reasons:  # no kernel list registered for this platform
+        failure_reasons.append(" no kernels registered for this platform")
+    raise ValueError(
+        "Failed to find a kernel that can implement the "\
+        "ScaledMM linear layer. Reasons: \n"
+        + '\n'.join(failure_reasons))
diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py
new file mode 100644
index 0000000000000..2e83a04286a0d
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py
@@ -0,0 +1,134 @@
+from typing import Optional, Tuple
+
+import torch
+
+from vllm import _custom_ops as ops
+from vllm.model_executor.layers.quantization.utils import replace_parameter
+from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
+    convert_to_channelwise)
+from vllm.platforms import current_platform
+
+from .ScaledMMLinearKernel import (ScaledMMLinearKernel,
+                                   ScaledMMLinearLayerConfig)
+
+
+class CutlassScaledMMLinearKernel(ScaledMMLinearKernel):
+
+    @classmethod
+    def get_min_capability(cls) -> int:
+        return 75
+
+    @classmethod
+    def can_implement(
+            cls, c: ScaledMMLinearLayerConfig) -> Tuple[bool, Optional[str]]:
+
+        if (not current_platform.is_cuda() and not current_platform.is_cpu()):
+            return False, "CutlassScaledMM requires running on CUDA or CPU."
+
+        return True, None
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        # WEIGHT
+        # Cutlass kernels need transposed weight.
+        weight = getattr(layer, self.w_q_name)
+        replace_parameter(
+            layer, self.w_q_name,
+            torch.nn.Parameter(weight.t().data, requires_grad=False))
+
+        # WEIGHT SCALE
+        # Cutlass kernels support only per-tensor and per-channel.
+        # If we have a fused module (QKV, MLP) with per tensor scales (thus N
+        # scales being passed to the kernel), convert to the per-channel case.
+        is_fused_module = len(layer.logical_widths) > 1
+        weight_scale = getattr(layer, self.w_s_name)
+        if is_fused_module and not self.config.is_channelwise:
+            weight_scale = convert_to_channelwise(weight_scale,
+                                                  layer.logical_widths)
+        replace_parameter(
+            layer, self.w_s_name,
+            torch.nn.Parameter(weight_scale.data, requires_grad=False))
+
+        # INPUT SCALE
+        if self.config.is_static_input_scheme:
+            input_scale = getattr(layer, self.i_s_name)
+
+            if self.config.input_symmetric:
+                replace_parameter(
+                    layer, self.i_s_name,
+                    torch.nn.Parameter(input_scale.max(), requires_grad=False))
+                setattr(layer, self.i_zp_name, None)
+            else:
+                input_zero_point = getattr(layer, self.i_zp_name)
+
+                # reconstruct the ranges
+                int8_traits = torch.iinfo(torch.int8)
+                azps = input_zero_point.to(dtype=torch.int32)
+                range_max = (input_scale * (int8_traits.max - azps)).max()
+                range_min = (input_scale * (int8_traits.min - azps)).min()
+
+                scale = (range_max - range_min) / (int8_traits.max -
+                                                   int8_traits.min)
+                replace_parameter(
+                    layer, self.i_s_name,
+                    torch.nn.Parameter(scale, requires_grad=False))
+
+                # AZP loaded as int8 but used as int32
+                azp = (int8_traits.min -
+                       range_min / scale).to(dtype=torch.int32)
+                replace_parameter(layer, self.i_zp_name,
+                                  torch.nn.Parameter(azp, requires_grad=False))
+
+        else:
+            setattr(layer, self.i_s_name, None)
+            setattr(layer, self.i_zp_name, None)
+
+        # azp_adj is the AZP adjustment term, used to account for weights.
+        # It does not depend on scales or azp, so it is the same for
+        # static and dynamic quantization.
+        # For more details, see csrc/quantization/cutlass_w8a8/Epilogues.md
+        # https://github.com/vllm-project/vllm/blob/8d59dbb00044a588cab96bcdc028006ed922eb06/csrc/quantization/cutlass_w8a8/Epilogues.md
+        if not self.config.input_symmetric:
+            weight = getattr(layer, self.w_q_name)
+            azp_adj = weight.sum(dim=0, keepdim=True, dtype=torch.int32)
+            if self.config.is_static_input_scheme:
+                # cutlass_w8a8 requires azp to be folded into azp_adj
+                # in the per-tensor case
+                azp_adj = getattr(layer, self.i_zp_name) * azp_adj
+            setattr(layer, self.azp_adj_name,
+                    torch.nn.Parameter(azp_adj, requires_grad=False))
+        else:
+            setattr(layer, self.azp_adj_name, None)
+
+    def apply_weights(self,
+                      layer: torch.nn.Module,
+                      x: torch.Tensor,
+                      bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+        w_q, w_s, i_s, i_zp, azp_adj = self._get_weight_params(layer)
+
+        # ops.scaled_int8_quant supports both dynamic and static quant:
+        # * dynamic, i_s is None and x_s computed from x.
+        # * static, i_s is scalar and x_s is i_s.
+        symmetric = azp_adj is None
+        x_q, x_s, x_zp = ops.scaled_int8_quant(x,
+                                               i_s,
+                                               i_zp,
+                                               symmetric=symmetric)
+
+        if x_zp is not None:
+            # Currently, static is always per-tensor and dynamic is per-token
+            static = i_zp is not None
+            azp = None if static else x_zp
+            return ops.cutlass_scaled_mm_azp(x_q,
+                                             w_q,
+                                             scale_a=x_s,
+                                             scale_b=w_s,
+                                             out_dtype=x.dtype,
+                                             azp_adj=azp_adj,
+                                             azp=azp,
+                                             bias=bias)
+        return ops.cutlass_scaled_mm(x_q,
+                                     w_q,
+                                     scale_a=x_s,
+                                     scale_b=w_s,
+                                     out_dtype=x.dtype,
+                                     bias=bias)
diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py
new file mode 100644
index 0000000000000..9de668e658826
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py
@@ -0,0 +1,101 @@
+import warnings
+from typing import Optional, Tuple
+
+import torch
+from functorch.experimental.control_flow import cond  # noqa: F401
+
+from vllm.model_executor.layers.quantization.utils import replace_parameter
+from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
+    convert_to_channelwise)
+from vllm.platforms import current_platform
+
+from .ScaledMMLinearKernel import (ScaledMMLinearKernel,
+                                   ScaledMMLinearLayerConfig)
+
+
+class XLAScaledMMLinearKernel(ScaledMMLinearKernel):
+
+    @classmethod
+    def get_min_capability(cls) -> int:
+        raise NotImplementedError(
+            "TPU platform does not have a concept of compute capability, "
+            "this method should not be called.")
+
+    @classmethod
+    def can_implement(
+            cls, c: ScaledMMLinearLayerConfig) -> Tuple[bool, Optional[str]]:
+
+        if not current_platform.is_tpu():
+            return False, "ScaledMMXLA requires running on TPU."
+
+        if c.is_static_input_scheme:
+            return False, "ScaledMMXLA requires dynamic activation scales."
+
+        if not c.input_symmetric:
+            return False, "ScaledMMXLA requires symmetric activation scales."
+
+        if not c.is_channelwise:
+            return False, "ScaledMMXLA requires channelwise weight scales"
+
+        return True, None
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        # WEIGHT
+        # [out, in] (different than cutlass_scaled_mm)
+        weight = getattr(layer, self.w_q_name)
+        replace_parameter(layer, self.w_q_name,
+                          torch.nn.Parameter(weight.data, requires_grad=False))
+
+        # WEIGHT SCALE
+        # XLA kernels support only per-tensor and per-channel.
+        # If we have a fused module (QKV, MLP) with per tensor scales (thus N
+        # scales being passed to the kernel), convert to the per-channel case.
+        is_fused_module = len(layer.logical_widths) > 1
+        weight_scale = getattr(layer, self.w_s_name)
+        if is_fused_module and not self.config.is_channelwise:
+            weight_scale = convert_to_channelwise(weight_scale,
+                                                  layer.logical_widths)
+
+        # [out_channel,] (different than cutlass_scaled_mm)
+        weight_scale = weight_scale.squeeze(-1)
+        replace_parameter(
+            layer, self.w_s_name,
+            torch.nn.Parameter(weight_scale.data, requires_grad=False))
+
+        # Only support symmetric dynamic activation quantization.
+        setattr(layer, self.i_s_name, None)
+        setattr(layer, self.i_zp_name, None)
+        setattr(layer, self.azp_adj_name, None)
+
+        # Filter warning for cond usage in apply_weights. It is okay
+        # to specialize the graph since bias is not dynamic.
+        warnings.filterwarnings(
+            "ignore",
+            message=
+            "Pred is a Python constant. When used with torch.cond, it specializes on one of the branches."  # noqa: E501
+        )
+
+    def no_add_bias(self, x: torch.Tensor, bias: Optional[torch.Tensor]):
+        return x
+
+    def add_bias(self, x: torch.Tensor, bias: Optional[torch.Tensor]):
+        return x + bias
+
+    def apply_weights(self,
+                      layer: torch.nn.Module,
+                      x: torch.Tensor,
+                      bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+        w_q, w_s, _, _, _ = self._get_weight_params(layer)
+
+        import torch_xla.experimental.xla_quantized_matmul  # noqa: F401
+        out = torch.ops.xla.quantized_matmul(x,
+                                             w_q,
+                                             w_s,
+                                             zero_point=None,
+                                             block_size=-1,
+                                             int4_weight=False,
+                                             quantize_activation=True)
+
+        # Explicitly capture control flow to make dynamo happy.
+        # https://pytorch.org/docs/main/generated/exportdb/index.html#cond-branch-class-method # noqa: E501
+        return cond(bias is None, self.no_add_bias, self.add_bias, [out, bias])
diff --git a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
index d89071f30a549..7cdce67cf1677 100644
--- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
@@ -201,44 +201,6 @@ def apply_fp8_linear(
             return output.to(dtype=input.dtype).view(*output_shape)
 
 
-def apply_int8_linear(
-    input: torch.Tensor,
-    weight: torch.Tensor,
-    weight_scale: torch.Tensor,
-    input_scale: Optional[torch.Tensor] = None,
-    input_zero_point: Optional[torch.Tensor] = None,
-    azp_adj: Optional[torch.Tensor] = None,
-    bias: Optional[torch.Tensor] = None,
-):
-    # ops.scaled_int8_quant supports both dynamic and static quant.
-    # * dynamic, layer.input_scale is None and x_scale computed from x.
-    # * static, layer.input_scale is scalar and x_scale is input_scale.
-    symmetric = azp_adj is None
-    x_q, x_scale, x_zp = ops.scaled_int8_quant(input,
-                                               input_scale,
-                                               input_zero_point,
-                                               symmetric=symmetric)
-
-    if x_zp is not None:
-        # Currently, static is always per-tensor and dynamic is per-token
-        static = input_zero_point is not None
-        azp = None if static else x_zp
-        return ops.cutlass_scaled_mm_azp(x_q,
-                                         weight,
-                                         scale_a=x_scale,
-                                         scale_b=weight_scale,
-                                         out_dtype=input.dtype,
-                                         azp_adj=azp_adj,
-                                         azp=azp,
-                                         bias=bias)
-    return ops.cutlass_scaled_mm(x_q,
-                                 weight,
-                                 scale_a=x_scale,
-                                 scale_b=weight_scale,
-                                 out_dtype=input.dtype,
-                                 bias=bias)
-
-
 def normalize_e4m3fn_to_e4m3fnuz(
     weight: torch.Tensor,
     weight_scale: torch.Tensor,
diff --git a/vllm/model_executor/parameter.py b/vllm/model_executor/parameter.py
index 02d22a5ca62c0..fc5a3e7fba674 100644
--- a/vllm/model_executor/parameter.py
+++ b/vllm/model_executor/parameter.py
@@ -6,6 +6,7 @@
 
 from vllm.distributed import get_tensor_model_parallel_rank
 from vllm.logger import init_logger
+from vllm.model_executor.utils import _make_synced_weight_loader
 
 __all__ = [
     "BasevLLMParameter", "PackedvLLMParameter", "PerTensorScaleParameter",
@@ -37,6 +38,18 @@ def __init__(self, data: torch.Tensor, weight_loader: Callable):
         :returns: a torch.nn.parameter
         """
 
+        # During weight loading, we often do something like:
+        # narrowed_tensor = param.data.narrow(0, offset, len)
+        # narrowed_tensor.copy_(real_weight)
+        # expecting narrowed_tensor and param.data to share the same storage.
+        # However, on TPUs, narrowed_tensor will lazily propagate to the base
+        # tensor, which is param.data, leading to the redundant memory usage.
+        # This sometimes causes OOM errors during model loading. To avoid this,
+        # we sync the param tensor after its weight loader is called.
+        from vllm.platforms import current_platform
+        if current_platform.is_tpu():
+            weight_loader = _make_synced_weight_loader(weight_loader)
+
         self._weight_loader = weight_loader
 
     @property
diff --git a/vllm/platforms/tpu.py b/vllm/platforms/tpu.py
index 77f5c8401424b..d488daf056f1a 100644
--- a/vllm/platforms/tpu.py
+++ b/vllm/platforms/tpu.py
@@ -19,7 +19,9 @@ class TpuPlatform(Platform):
     device_name: str = "tpu"
     device_type: str = "tpu"
     dispatch_key: str = "XLA"
-    supported_quantization: list[str] = ["tpu_int8"]
+    supported_quantization: list[str] = [
+        "tpu_int8", "compressed-tensors", "compressed_tensors"
+    ]
 
     @classmethod
     def get_default_attn_backend(cls, selected_backend: _Backend) -> _Backend:

From 526de822d501c792b051c864ba873a836d78d5bf Mon Sep 17 00:00:00 2001
From: rasmith <Randall.Smith@amd.com>
Date: Wed, 8 Jan 2025 14:23:15 -0600
Subject: [PATCH 047/115] [Kernel][Triton][AMD] Use block size heuristic for
 avg 2.8x speedup for int8 models (#11698)

Signed-off-by: Randall Smith <Randall.Smith@amd.com>
---
 .../compressed_tensors/triton_scaled_mm.py      | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py b/vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py
index 3ff162170f255..2659afcdc74a9 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py
@@ -128,7 +128,8 @@ def triton_scaled_mm(input: torch.Tensor,
                      bias: Optional[torch.Tensor] = None,
                      block_size_m: int = 32,
                      block_size_n: int = 32,
-                     block_size_k: int = 32) -> torch.Tensor:
+                     block_size_k: int = 32,
+                     use_heuristic=True) -> torch.Tensor:
     M, K = input.shape
     N = weight.shape[1]
 
@@ -152,6 +153,20 @@ def triton_scaled_mm(input: torch.Tensor,
 
     has_scalar = lambda x: x.shape[0] == 1 and x.shape[1] == 1
 
+    if use_heuristic:
+        is_small_N = N < 8192
+        next_power_of_2_M = max(32, triton.next_power_of_2(M))
+        if next_power_of_2_M <= 32:
+            tile_shape = (64, 64, 256) if is_small_N else (64, 128, 256)
+        elif next_power_of_2_M <= 64:
+            tile_shape = (64, 64, 256)
+        elif next_power_of_2_M <= 128:
+            tile_shape = (64, 128, 128)
+        else:
+            tile_shape = (128, 128, 128)
+        # Only override the caller's block sizes when a tile shape was picked.
+        block_size_m, block_size_n, block_size_k = tile_shape
+
     block_size_sa = 1 if has_scalar(scale_a) else block_size_m
     block_size_sb = 1 if has_scalar(scale_b) else block_size_n
 

From 3db0cafdf1fe7f4cd7e41a145f78e8a568b4d63c Mon Sep 17 00:00:00 2001
From: Simon Mo <simon.mo@hey.com>
Date: Wed, 8 Jan 2025 12:38:28 -0800
Subject: [PATCH 048/115] [Docs] Add Google Cloud Meetup (#11864)

---
 README.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/README.md b/README.md
index 1f82229f39537..253a0bb913e37 100644
--- a/README.md
+++ b/README.md
@@ -15,6 +15,10 @@ Easy, fast, and cheap LLM serving for everyone
 
 ---
 
+The first vLLM meetup in 2025 is happening on January 22nd, Wednesday, with Google Cloud in San Francisco! We will talk about vLLM's performant V1 architecture, Q1 roadmap, Google Cloud's innovation around vLLM: networking, Cloud Run, Vertex, and TPU! [Register Now](https://lu.ma/zep56hui)
+
+---
+
 *Latest News* 🔥
 - [2024/12] vLLM joins [pytorch ecosystem](https://pytorch.org/blog/vllm-joins-pytorch)! Easy, Fast, and Cheap LLM Serving for Everyone!
 - [2024/11] We hosted [the seventh vLLM meetup](https://lu.ma/h0qvrajz) with Snowflake! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing), and Snowflake team [here](https://docs.google.com/presentation/d/1qF3RkDAbOULwz9WK5TOltt2fE9t6uIc_hVNLFAaQX6A/edit?usp=sharing).

From 615e4a54017136649db275b68932af80168781f8 Mon Sep 17 00:00:00 2001
From: Tyler Michael Smith <tyler@neuralmagic.com>
Date: Wed, 8 Jan 2025 21:20:44 -0500
Subject: [PATCH 049/115] [CI] Turn on basic correctness tests for V1 (#10864)

---
 tests/basic_correctness/test_basic_correctness.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py
index 1c2193bb17a55..31a101e48e026 100644
--- a/tests/basic_correctness/test_basic_correctness.py
+++ b/tests/basic_correctness/test_basic_correctness.py
@@ -44,7 +44,6 @@ def test_vllm_gc_ed():
     assert weak_llm() is None
 
 
-@pytest.mark.skip_v1
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("backend", ["FLASH_ATTN", "XFORMERS", "FLASHINFER"])
 @pytest.mark.parametrize("dtype", ["half"])

From 1fe554bac32419a6d64a5c977849806a1efd9725 Mon Sep 17 00:00:00 2001
From: Maximilien de Bayser <mbayser@br.ibm.com>
Date: Thu, 9 Jan 2025 00:05:43 -0300
Subject: [PATCH 050/115] treat do_lower_case in the same way as the
 sentence-transformers library (#11815)

Signed-off-by: Max de Bayser <mbayser@br.ibm.com>
---
 tests/entrypoints/openai/test_serving_chat.py       | 1 +
 tests/models/embedding/language/test_embedding.py   | 1 +
 vllm/entrypoints/openai/serving_engine.py           | 5 +++++
 vllm/inputs/preprocess.py                           | 6 ++++++
 vllm/transformers_utils/tokenizer_group/__init__.py | 5 -----
 5 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py
index 97248f1150979..f431d1065e0eb 100644
--- a/tests/entrypoints/openai/test_serving_chat.py
+++ b/tests/entrypoints/openai/test_serving_chat.py
@@ -35,6 +35,7 @@ class MockModelConfig:
     logits_processor_pattern = None
     diff_sampling_param: Optional[dict] = None
     allowed_local_media_path: str = ""
+    encoder_config = None
 
     def get_diff_sampling_param(self):
         return self.diff_sampling_param or {}
diff --git a/tests/models/embedding/language/test_embedding.py b/tests/models/embedding/language/test_embedding.py
index f458ef5ef556d..7749806548cd9 100644
--- a/tests/models/embedding/language/test_embedding.py
+++ b/tests/models/embedding/language/test_embedding.py
@@ -15,6 +15,7 @@
         # [Encoder-only]
         pytest.param("BAAI/bge-base-en-v1.5",
                      marks=[pytest.mark.core_model, pytest.mark.cpu_model]),
+        pytest.param("sentence-transformers/all-MiniLM-L12-v2"),
         pytest.param("intfloat/multilingual-e5-large"),
         # [Encoder-decoder]
         pytest.param("intfloat/e5-mistral-7b-instruct",
diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py
index 319f869240036..88859255f202a 100644
--- a/vllm/entrypoints/openai/serving_engine.py
+++ b/vllm/entrypoints/openai/serving_engine.py
@@ -160,6 +160,11 @@ def _normalize_prompt_text_to_input(
         truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]],
         add_special_tokens: bool,
     ) -> TextTokensPrompt:
+        if (self.model_config.encoder_config is not None
+                and self.model_config.encoder_config.get(
+                    "do_lower_case", False)):
+            prompt = prompt.lower()
+
         if truncate_prompt_tokens is None:
             encoded = tokenizer(prompt, add_special_tokens=add_special_tokens)
         else:
diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py
index 6ddc1eb76f10d..3e92d5821e645 100644
--- a/vllm/inputs/preprocess.py
+++ b/vllm/inputs/preprocess.py
@@ -190,6 +190,12 @@ def _tokenize_prompt(
             # on the task and language of their request. Also needed to avoid
             # appending an EOS token to the prompt which disrupts generation.
             add_special_tokens = False
+
+        if (self.model_config.encoder_config is not None
+                and self.model_config.encoder_config.get(
+                    "do_lower_case", False)):
+            prompt = prompt.lower()
+
         return tokenizer.encode(request_id=request_id,
                                 prompt=prompt,
                                 lora_request=lora_request,
diff --git a/vllm/transformers_utils/tokenizer_group/__init__.py b/vllm/transformers_utils/tokenizer_group/__init__.py
index c0b3d2585a962..d400276796996 100644
--- a/vllm/transformers_utils/tokenizer_group/__init__.py
+++ b/vllm/transformers_utils/tokenizer_group/__init__.py
@@ -26,11 +26,6 @@ def init_tokenizer_from_configs(model_config: ModelConfig,
                        trust_remote_code=model_config.trust_remote_code,
                        revision=model_config.tokenizer_revision)
 
-    if (model_config.encoder_config is not None
-            and "do_lower_case" in model_config.encoder_config):
-        init_kwargs["do_lower_case"] = model_config.encoder_config[
-            "do_lower_case"]
-
     return get_tokenizer_group(parallel_config.tokenizer_pool_config,
                                **init_kwargs)
 

From 730e9592e97c643474aa44e9d3dbe6f55c4b9ad9 Mon Sep 17 00:00:00 2001
From: Michael Goin <michael@neuralmagic.com>
Date: Wed, 8 Jan 2025 22:37:48 -0500
Subject: [PATCH 051/115] [Doc] Recommend uv and python 3.12 for quickstart
 guide (#11849)

Signed-off-by: mgoin <michael@neuralmagic.com>
---
 docs/source/getting_started/quickstart.md | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/docs/source/getting_started/quickstart.md b/docs/source/getting_started/quickstart.md
index 2808e1b386801..ea15d9ef065fa 100644
--- a/docs/source/getting_started/quickstart.md
+++ b/docs/source/getting_started/quickstart.md
@@ -15,10 +15,19 @@ This guide will help you quickly get started with vLLM to perform:
 ## Installation
 
 If you are using NVIDIA GPUs, you can install vLLM using [pip](https://pypi.org/project/vllm/) directly.
-It's recommended to use [conda](https://docs.conda.io/projects/conda/en/latest/user-guide/getting-started.html) to create and manage Python environments.
+
+It's recommended to use [uv](https://docs.astral.sh/uv/), a very fast Python environment manager, to create and manage Python environments. Please follow the [documentation](https://docs.astral.sh/uv/#getting-started) to install `uv`. After installing `uv`, you can create a new Python environment and install vLLM using the following commands:
+
+```console
+$ uv venv myenv --python 3.12 --seed
+$ source myenv/bin/activate
+$ uv pip install vllm
+```
+
+You can also use [conda](https://docs.conda.io/projects/conda/en/latest/user-guide/getting-started.html) to create and manage Python environments.
 
 ```console
-$ conda create -n myenv python=3.10 -y
+$ conda create -n myenv python=3.12 -y
 $ conda activate myenv
 $ pip install vllm
 ```

From d848800e884f581eeed9f154d6c2aeb38eac24de Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Thu, 9 Jan 2025 12:48:12 +0800
Subject: [PATCH 052/115] [Misc] Move `print_*_once` from utils to logger
 (#11298)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: Maxime Fournioux <55544262+mfournioux@users.noreply.github.com>
Co-authored-by: Maxime Fournioux <55544262+mfournioux@users.noreply.github.com>
---
 .github/workflows/lint-and-deploy.yaml        |  1 +
 vllm/attention/backends/torch_sdpa.py         |  9 ++-
 vllm/attention/backends/xformers.py           |  8 ++-
 vllm/config.py                                |  9 ++-
 vllm/entrypoints/chat_utils.py                |  7 +--
 vllm/inputs/preprocess.py                     | 20 ++++---
 vllm/inputs/registry.py                       |  4 +-
 vllm/logger.py                                | 57 +++++++++++++++++--
 vllm/lora/peft_helper.py                      |  6 +-
 vllm/lora/punica_wrapper/punica_selector.py   |  8 ++-
 vllm/model_executor/custom_op.py              |  3 +-
 .../compressed_tensors_moe.py                 |  8 ++-
 .../model_executor/layers/quantization/fp8.py |  5 +-
 .../layers/quantization/kv_cache.py           |  6 +-
 .../quantization/utils/marlin_utils_fp8.py    |  6 +-
 .../model_loader/weight_utils.py              |  8 +--
 vllm/model_executor/models/chameleon.py       |  6 +-
 vllm/model_executor/models/olmoe.py           |  6 +-
 vllm/model_executor/models/qwen2_moe.py       |  6 +-
 vllm/model_executor/models/vision.py          |  6 +-
 vllm/utils.py                                 | 12 ----
 21 files changed, 129 insertions(+), 72 deletions(-)

diff --git a/.github/workflows/lint-and-deploy.yaml b/.github/workflows/lint-and-deploy.yaml
index ee768db63c96c..556b60d2fca12 100644
--- a/.github/workflows/lint-and-deploy.yaml
+++ b/.github/workflows/lint-and-deploy.yaml
@@ -64,6 +64,7 @@ jobs:
         run: |
           export AWS_ACCESS_KEY_ID=minioadmin
           export AWS_SECRET_ACCESS_KEY=minioadmin
+          sleep 30 && kubectl -n ns-vllm logs -f "$(kubectl -n ns-vllm get pods | awk '/deployment/ {print $1;exit}')" &
           helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/online_serving/chart-helm -f examples/online_serving/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env"
     
       - name: curl test
diff --git a/vllm/attention/backends/torch_sdpa.py b/vllm/attention/backends/torch_sdpa.py
index c14f7754596dd..ca1c4618615de 100644
--- a/vllm/attention/backends/torch_sdpa.py
+++ b/vllm/attention/backends/torch_sdpa.py
@@ -13,9 +13,12 @@
 from vllm.attention.backends.utils import CommonAttentionState
 from vllm.attention.ops.ipex_attn import PagedAttention
 from vllm.attention.ops.paged_attn import PagedAttentionMetadata
-from vllm.utils import make_tensor_with_pad, print_warning_once
+from vllm.logger import init_logger
+from vllm.utils import make_tensor_with_pad
 from vllm.worker.cpu_model_runner import ModelInputForCPUBuilder
 
+logger = init_logger(__name__)
+
 
 class TorchSDPABackend(AttentionBackend):
 
@@ -396,8 +399,8 @@ def __init__(
             raise ValueError(
                 "Torch SPDA does not support block-sparse attention.")
         if logits_soft_cap is not None:
-            print_warning_once("Torch SPDA does not support logits soft cap. "
-                               "Outputs may be slightly off.")
+            logger.warning_once("Torch SPDA does not support logits soft cap. "
+                                "Outputs may be slightly off.")
         self.num_heads = num_heads
         self.head_size = head_size
         self.scale = float(scale)
diff --git a/vllm/attention/backends/xformers.py b/vllm/attention/backends/xformers.py
index 694c7cc1bc36a..8c8ca8520a9db 100644
--- a/vllm/attention/backends/xformers.py
+++ b/vllm/attention/backends/xformers.py
@@ -17,7 +17,9 @@
     is_all_cross_attn_metadata_set, is_all_encoder_attn_metadata_set)
 from vllm.attention.ops.paged_attn import (PagedAttention,
                                            PagedAttentionMetadata)
-from vllm.utils import print_warning_once
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
 
 
 class XFormersBackend(AttentionBackend):
@@ -385,8 +387,8 @@ def __init__(
             raise ValueError(
                 "XFormers does not support block-sparse attention.")
         if logits_soft_cap is not None:
-            print_warning_once("XFormers does not support logits soft cap. "
-                               "Outputs may be slightly off.")
+            logger.warning_once("XFormers does not support logits soft cap. "
+                                "Outputs may be slightly off.")
         self.num_heads = num_heads
         self.head_size = head_size
         self.scale = float(scale)
diff --git a/vllm/config.py b/vllm/config.py
index 6dabeb3861af2..19609085cc960 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -32,8 +32,7 @@
 from vllm.transformers_utils.s3_utils import S3Model
 from vllm.transformers_utils.utils import is_s3
 from vllm.utils import (GiB_bytes, LayerBlockType, cuda_device_count_stateless,
-                        get_cpu_memory, print_warning_once, random_uuid,
-                        resolve_obj_by_qualname)
+                        get_cpu_memory, random_uuid, resolve_obj_by_qualname)
 
 if TYPE_CHECKING:
     from ray.util.placement_group import PlacementGroup
@@ -314,7 +313,7 @@ def __init__(self,
                 sliding_window_len_min = get_min_sliding_window(
                     self.hf_text_config.sliding_window)
 
-                print_warning_once(
+                logger.warning_once(
                     f"{self.hf_text_config.model_type} has interleaved "
                     "attention, which is currently not supported by the "
                     "XFORMERS backend. Disabling sliding window and capping "
@@ -2758,7 +2757,7 @@ def uuid(self):
 
         def model_post_init(self, __context: Any) -> None:
             if not self.enable_reshape and self.enable_fusion:
-                print_warning_once(
+                logger.warning_once(
                     "Fusion enabled but reshape elimination disabled."
                     "RMSNorm + quant (fp8) fusion might not work")
 
@@ -3151,7 +3150,7 @@ def __post_init__(self):
             self.scheduler_config.chunked_prefill_enabled and \
             self.model_config.dtype == torch.float32 and \
             current_platform.get_device_capability() == (7, 5):
-            print_warning_once(
+            logger.warning_once(
                 "Turing devices tensor cores do not support float32 matmul. "
                 "To workaround this limitation, vLLM will set 'ieee' input "
                 "precision for chunked prefill triton kernels.")
diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py
index a492d5496e025..923c7459f6948 100644
--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -35,7 +35,6 @@
 from vllm.multimodal import MultiModalDataDict
 from vllm.multimodal.utils import MediaConnector
 from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
-from vllm.utils import print_warning_once
 
 logger = init_logger(__name__)
 
@@ -985,14 +984,14 @@ def apply_mistral_chat_template(
     **kwargs: Any,
 ) -> List[int]:
     if chat_template is not None:
-        print_warning_once(
+        logger.warning_once(
             "'chat_template' cannot be overridden for mistral tokenizer.")
     if "add_generation_prompt" in kwargs:
-        print_warning_once(
+        logger.warning_once(
             "'add_generation_prompt' is not supported for mistral tokenizer, "
             "so it will be ignored.")
     if "continue_final_message" in kwargs:
-        print_warning_once(
+        logger.warning_once(
             "'continue_final_message' is not supported for mistral tokenizer, "
             "so it will be ignored.")
 
diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py
index 3e92d5821e645..a738ffe18e3ae 100644
--- a/vllm/inputs/preprocess.py
+++ b/vllm/inputs/preprocess.py
@@ -10,7 +10,6 @@
 from vllm.multimodal.inputs import MultiModalDataDict, MultiModalInputsV2
 from vllm.prompt_adapter.request import PromptAdapterRequest
 from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup
-from vllm.utils import print_info_once, print_warning_once
 
 from .data import (DecoderOnlyInputs, EncoderDecoderInputs, ProcessorInputs,
                    PromptType, SingletonInputs, SingletonPrompt, token_inputs)
@@ -68,21 +67,24 @@ def get_decoder_start_token_id(self) -> Optional[int]:
         '''
 
         if not self.model_config.is_encoder_decoder:
-            print_warning_once("Using None for decoder start token id because "
-                               "this is not an encoder/decoder model.")
+            logger.warning_once(
+                "Using None for decoder start token id because "
+                "this is not an encoder/decoder model.")
             return None
 
         if (self.model_config is None or self.model_config.hf_config is None):
-            print_warning_once("Using None for decoder start token id because "
-                               "model config is not available.")
+            logger.warning_once(
+                "Using None for decoder start token id because "
+                "model config is not available.")
             return None
 
         dec_start_token_id = getattr(self.model_config.hf_config,
                                      'decoder_start_token_id', None)
         if dec_start_token_id is None:
-            print_warning_once("Falling back on <BOS> for decoder start token "
-                               "id because decoder start token id is not "
-                               "available.")
+            logger.warning_once(
+                "Falling back on <BOS> for decoder start token "
+                "id because decoder start token id is not "
+                "available.")
             dec_start_token_id = self.get_bos_token_id()
 
         return dec_start_token_id
@@ -231,7 +233,7 @@ def _can_process_multimodal(self) -> bool:
         # updated to use the new multi-modal processor
         can_process_multimodal = self.mm_registry.has_processor(model_config)
         if not can_process_multimodal:
-            print_info_once(
+            logger.info_once(
                 "Your model uses the legacy input pipeline instead of the new "
                 "multi-modal processor. Please note that the legacy pipeline "
                 "will be removed in a future release. For more details, see: "
diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py
index b22b3f1594f24..aad0dfab94a01 100644
--- a/vllm/inputs/registry.py
+++ b/vllm/inputs/registry.py
@@ -12,7 +12,7 @@
 from vllm.transformers_utils.processor import cached_get_processor
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 from vllm.utils import (ClassRegistry, get_allowed_kwarg_only_overrides,
-                        print_warning_once, resolve_mm_processor_kwargs)
+                        resolve_mm_processor_kwargs)
 
 from .data import ProcessorInputs, SingletonInputs
 from .parse import is_encoder_decoder_inputs
@@ -352,7 +352,7 @@ def dummy_data_for_profiling(
         num_tokens = dummy_data.seq_data.prompt_token_ids
         if len(num_tokens) < seq_len:
             if is_encoder_data:
-                print_warning_once(
+                logger.warning_once(
                     f"Expected at least {seq_len} dummy encoder tokens for "
                     f"profiling, but found {len(num_tokens)} tokens instead.")
             else:
diff --git a/vllm/logger.py b/vllm/logger.py
index 538db0dcf19aa..cac174f7ba02a 100644
--- a/vllm/logger.py
+++ b/vllm/logger.py
@@ -4,11 +4,12 @@
 import logging
 import os
 import sys
-from functools import partial
+from functools import lru_cache, partial
 from logging import Logger
 from logging.config import dictConfig
 from os import path
-from typing import Dict, Optional
+from types import MethodType
+from typing import Any, Optional, cast
 
 import vllm.envs as envs
 
@@ -49,8 +50,44 @@
 }
 
 
+@lru_cache
+def _print_info_once(logger: Logger, msg: str) -> None:
+    # Set the stacklevel to 2 to print the original caller's line info
+    logger.info(msg, stacklevel=2)
+
+
+@lru_cache
+def _print_warning_once(logger: Logger, msg: str) -> None:
+    # Set the stacklevel to 2 to print the original caller's line info
+    logger.warning(msg, stacklevel=2)
+
+
+class _VllmLogger(Logger):
+    """
+    Note:
+        This class is just to provide type information.
+        We actually patch the methods directly on the :class:`logging.Logger`
+        instance to avoid conflicting with other libraries such as
+        `intel_extension_for_pytorch.utils._logger`.
+    """
+
+    def info_once(self, msg: str) -> None:
+        """
+        As :meth:`info`, but subsequent calls with the same message
+        are silently dropped.
+        """
+        _print_info_once(self, msg)
+
+    def warning_once(self, msg: str) -> None:
+        """
+        As :meth:`warning`, but subsequent calls with the same message
+        are silently dropped.
+        """
+        _print_warning_once(self, msg)
+
+
 def _configure_vllm_root_logger() -> None:
-    logging_config: Dict = {}
+    logging_config = dict[str, Any]()
 
     if not VLLM_CONFIGURE_LOGGING and VLLM_LOGGING_CONFIG_PATH:
         raise RuntimeError(
@@ -84,12 +121,22 @@ def _configure_vllm_root_logger() -> None:
         dictConfig(logging_config)
 
 
-def init_logger(name: str) -> Logger:
+def init_logger(name: str) -> _VllmLogger:
     """The main purpose of this function is to ensure that loggers are
     retrieved in such a way that we can be sure the root vllm logger has
     already been configured."""
 
-    return logging.getLogger(name)
+    logger = logging.getLogger(name)
+
+    methods_to_patch = {
+        "info_once": _print_info_once,
+        "warning_once": _print_warning_once,
+    }
+
+    for method_name, method in methods_to_patch.items():
+        setattr(logger, method_name, MethodType(method, logger))
+
+    return cast(_VllmLogger, logger)
 
 
 # The root logger is initialized when the module is imported.
diff --git a/vllm/lora/peft_helper.py b/vllm/lora/peft_helper.py
index ddd42ae93d290..dacfb9ebd1480 100644
--- a/vllm/lora/peft_helper.py
+++ b/vllm/lora/peft_helper.py
@@ -4,7 +4,9 @@
 from dataclasses import MISSING, dataclass, field, fields
 from typing import Literal, Optional, Union
 
-from vllm.utils import print_info_once
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
 
 
 @dataclass
@@ -42,7 +44,7 @@ def _validate_features(self):
     def __post_init__(self):
         self._validate_features()
         if self.use_rslora:
-            print_info_once("Loading LoRA weights trained with rsLoRA.")
+            logger.info_once("Loading LoRA weights trained with rsLoRA.")
             self.vllm_lora_scaling_factor = self.lora_alpha / math.sqrt(self.r)
         else:
             self.vllm_lora_scaling_factor = self.lora_alpha / self.r
diff --git a/vllm/lora/punica_wrapper/punica_selector.py b/vllm/lora/punica_wrapper/punica_selector.py
index cd64878d95ae3..9791d492d8e48 100644
--- a/vllm/lora/punica_wrapper/punica_selector.py
+++ b/vllm/lora/punica_wrapper/punica_selector.py
@@ -1,19 +1,21 @@
+from vllm.logger import init_logger
 from vllm.platforms import current_platform
-from vllm.utils import print_info_once
 
 from .punica_base import PunicaWrapperBase
 
+logger = init_logger(__name__)
+
 
 def get_punica_wrapper(*args, **kwargs) -> PunicaWrapperBase:
     if current_platform.is_cuda_alike():
         # Lazy import to avoid ImportError
         from vllm.lora.punica_wrapper.punica_gpu import PunicaWrapperGPU
-        print_info_once("Using PunicaWrapperGPU.")
+        logger.info_once("Using PunicaWrapperGPU.")
         return PunicaWrapperGPU(*args, **kwargs)
     elif current_platform.is_hpu():
         # Lazy import to avoid ImportError
         from vllm.lora.punica_wrapper.punica_hpu import PunicaWrapperHPU
-        print_info_once("Using PunicaWrapperHPU.")
+        logger.info_once("Using PunicaWrapperHPU.")
         return PunicaWrapperHPU(*args, **kwargs)
     else:
         raise NotImplementedError
diff --git a/vllm/model_executor/custom_op.py b/vllm/model_executor/custom_op.py
index fddc8bad09ef5..401606e8c76f0 100644
--- a/vllm/model_executor/custom_op.py
+++ b/vllm/model_executor/custom_op.py
@@ -5,7 +5,6 @@
 from vllm.config import get_current_vllm_config
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
-from vllm.utils import print_warning_once
 
 logger = init_logger(__name__)
 
@@ -91,7 +90,7 @@ def enabled(cls) -> bool:
         compilation_config = get_current_vllm_config().compilation_config
         custom_ops = compilation_config.custom_ops
         if not hasattr(cls, "name"):
-            print_warning_once(
+            logger.warning_once(
                 f"Custom op {cls.__name__} was not registered, "
                 f"which means it won't appear in the op registry. "
                 f"It will be enabled/disabled based on the global settings.")
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
index 5fd6b017f444b..4fb8fd84e92d4 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
@@ -8,6 +8,7 @@
 
 import vllm.model_executor.layers.fused_moe  # noqa
 from vllm import _custom_ops as ops
+from vllm.logger import init_logger
 from vllm.model_executor.layers.fused_moe import (FusedMoE, FusedMoEMethodBase,
                                                   FusedMoeWeightScaleSupported)
 from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
@@ -16,7 +17,8 @@
     all_close_1d, normalize_e4m3fn_to_e4m3fnuz, per_tensor_dequantize)
 from vllm.model_executor.utils import set_weight_attrs
 from vllm.platforms import current_platform
-from vllm.utils import print_warning_once
+
+logger = init_logger(__name__)
 
 
 class GPTQMarlinState(Enum):
@@ -142,10 +144,10 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
                     "activation scales are None.")
             if (not all_close_1d(layer.w13_input_scale)
                     or not all_close_1d(layer.w2_input_scale)):
-                print_warning_once(
+                logger.warning_once(
                     "Found input_scales that are not equal for "
                     "fp8 MoE layer. Using the maximum across experts "
-                    "for each layer. ")
+                    "for each layer.")
             layer.w13_input_scale = torch.nn.Parameter(
                 layer.w13_input_scale.max(), requires_grad=False)
             layer.w2_input_scale = torch.nn.Parameter(
diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py
index 2fe22903a385b..a1be45a49e94a 100644
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -28,7 +28,6 @@
                                            PerTensorScaleParameter)
 from vllm.model_executor.utils import set_weight_attrs
 from vllm.platforms import current_platform
-from vllm.utils import print_warning_once
 
 ACTIVATION_SCHEMES = ["static", "dynamic"]
 
@@ -539,10 +538,10 @@ def process_weights_after_loading(self, layer: Module) -> None:
                         "activation scales are None.")
                 if (not all_close_1d(layer.w13_input_scale)
                         or not all_close_1d(layer.w2_input_scale)):
-                    print_warning_once(
+                    logger.warning_once(
                         "Found input_scales that are not equal for "
                         "fp8 MoE layer. Using the maximum across experts "
-                        "for each layer. ")
+                        "for each layer.")
                 layer.w13_input_scale = torch.nn.Parameter(
                     layer.w13_input_scale.max(), requires_grad=False)
                 layer.w2_input_scale = torch.nn.Parameter(
diff --git a/vllm/model_executor/layers/quantization/kv_cache.py b/vllm/model_executor/layers/quantization/kv_cache.py
index d79536d196b92..a74f5415c8a51 100644
--- a/vllm/model_executor/layers/quantization/kv_cache.py
+++ b/vllm/model_executor/layers/quantization/kv_cache.py
@@ -1,8 +1,10 @@
 import torch
 
+from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig, QuantizeMethodBase)
-from vllm.utils import print_warning_once
+
+logger = init_logger(__name__)
 
 
 class BaseKVCacheMethod(QuantizeMethodBase):
@@ -67,7 +69,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
             layer._v_scale = v_scale
             if (layer._k_scale == 1.0 and layer._v_scale == 1.0
                     and "e5m2" not in layer.kv_cache_dtype):
-                print_warning_once(
+                logger.warning_once(
                     "Using KV cache scaling factor 1.0 for fp8_e4m3. This "
                     "may cause accuracy issues. Please make sure k/v_scale "
                     "scaling factors are available in the fp8 checkpoint.")
diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py
index 8b3dfaae971c3..245fe9238e421 100644
--- a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py
+++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py
@@ -3,11 +3,13 @@
 import torch
 
 import vllm._custom_ops as ops
+from vllm.logger import init_logger
 from vllm.platforms import current_platform
-from vllm.utils import print_warning_once
 
 from .marlin_utils import marlin_make_workspace, marlin_permute_scales
 
+logger = init_logger(__name__)
+
 
 def is_fp8_marlin_supported():
     return current_platform.has_device_capability(80)
@@ -47,7 +49,7 @@ def apply_fp8_marlin_linear(
 
 def prepare_fp8_layer_for_marlin(layer: torch.nn.Module,
                                  strategy: str = "tensor") -> None:
-    print_warning_once(
+    logger.warning_once(
         "Your GPU does not have native support for FP8 computation but "
         "FP8 quantization is being used. Weight-only FP8 compression will "
         "be used leveraging the Marlin kernel. This may degrade "
diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py
index a2c991cfdb74e..11d5fd7135d9e 100644
--- a/vllm/model_executor/model_loader/weight_utils.py
+++ b/vllm/model_executor/model_loader/weight_utils.py
@@ -25,7 +25,7 @@
                                                      get_quantization_config)
 from vllm.model_executor.layers.quantization.schema import QuantParamSchema
 from vllm.platforms import current_platform
-from vllm.utils import PlaceholderModule, print_warning_once
+from vllm.utils import PlaceholderModule
 
 try:
     from runai_model_streamer import SafetensorsStreamer
@@ -673,7 +673,7 @@ def maybe_remap_kv_scale_name(name: str, params_dict: dict) -> Optional[str]:
         None: If the remapped name is not found in params_dict.
     """
     if name.endswith(".kv_scale"):
-        print_warning_once(
+        logger.warning_once(
             "DEPRECATED. Found kv_scale in the checkpoint. "
             "This format is deprecated in favor of separate k_scale and "
             "v_scale tensors and will be removed in a future release. "
@@ -682,7 +682,7 @@ def maybe_remap_kv_scale_name(name: str, params_dict: dict) -> Optional[str]:
         # NOTE: we remap the deprecated kv_scale to k_scale
         remapped_name = name.replace(".kv_scale", ".attn.k_scale")
         if remapped_name not in params_dict:
-            print_warning_once(
+            logger.warning_once(
                 f"Found kv_scale in the checkpoint (e.g. {name}), "
                 "but not found the expected name in the model "
                 f"(e.g. {remapped_name}). kv_scale is "
@@ -695,7 +695,7 @@ def maybe_remap_kv_scale_name(name: str, params_dict: dict) -> Optional[str]:
         if name.endswith(scale_name):
             remapped_name = name.replace(scale_name, f".attn{scale_name}")
             if remapped_name not in params_dict:
-                print_warning_once(
+                logger.warning_once(
                     f"Found {scale_name} in the checkpoint (e.g. {name}), "
                     "but not found the expected name in the model "
                     f"(e.g. {remapped_name}). {scale_name} is "
diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py
index acff926891bbe..452fe727875fe 100644
--- a/vllm/model_executor/models/chameleon.py
+++ b/vllm/model_executor/models/chameleon.py
@@ -11,6 +11,7 @@
 from vllm.attention import Attention, AttentionMetadata
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
+from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
@@ -35,13 +36,14 @@
                                         BaseProcessingInfo, PromptReplacement)
 from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
 from vllm.sequence import IntermediateTensors
-from vllm.utils import print_warning_once
 
 from .interfaces import SupportsMultiModal, SupportsPP
 from .utils import (is_pp_missing_parameter,
                     make_empty_intermediate_tensors_factory, make_layers,
                     maybe_prefix, merge_multimodal_embeddings)
 
+logger = init_logger(__name__)
+
 
 class ChameleonImagePixelInputs(TypedDict):
     type: Literal["pixel_values"]
@@ -1111,7 +1113,7 @@ def load_weights(self, weights: Iterable[Tuple[str,
                         remapped_kv_scale_name = name.replace(
                             ".kv_scale", ".attn.kv_scale")
                         if remapped_kv_scale_name not in params_dict:
-                            print_warning_once(
+                            logger.warning_once(
                                 "Found kv scale in the checkpoint (e.g. "
                                 f"{name}), but not found the expected name in "
                                 f"the model (e.g. {remapped_kv_scale_name}). "
diff --git a/vllm/model_executor/models/olmoe.py b/vllm/model_executor/models/olmoe.py
index 5d9091cfb9311..fbe5d1aee04b3 100644
--- a/vllm/model_executor/models/olmoe.py
+++ b/vllm/model_executor/models/olmoe.py
@@ -20,6 +20,7 @@
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
+from vllm.logger import init_logger
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (QKVParallelLinear,
@@ -34,13 +35,14 @@
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors
-from vllm.utils import print_warning_once
 
 from .interfaces import SupportsPP
 from .utils import (is_pp_missing_parameter,
                     make_empty_intermediate_tensors_factory, make_layers,
                     maybe_prefix)
 
+logger = init_logger(__name__)
+
 
 class OlmoeMoE(nn.Module):
     """A tensor-parallel MoE implementation for Olmoe that shards each expert
@@ -446,7 +448,7 @@ def load_weights(self, weights: Iterable[Tuple[str,
                         remapped_kv_scale_name = name.replace(
                             ".kv_scale", ".attn.kv_scale")
                         if remapped_kv_scale_name not in params_dict:
-                            print_warning_once(
+                            logger.warning_once(
                                 "Found kv scale in the checkpoint "
                                 f"(e.g. {name}), but not found the expected "
                                 f"name in the model "
diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py
index ba70243c6533d..95de6c21871bf 100644
--- a/vllm/model_executor/models/qwen2_moe.py
+++ b/vllm/model_executor/models/qwen2_moe.py
@@ -34,6 +34,7 @@
 from vllm.distributed import (get_pp_group,
                               get_tensor_model_parallel_world_size,
                               tensor_model_parallel_all_reduce)
+from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
@@ -50,13 +51,14 @@
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors
-from vllm.utils import print_warning_once
 
 from .interfaces import SupportsPP
 from .utils import (extract_layer_index, is_pp_missing_parameter,
                     make_empty_intermediate_tensors_factory, make_layers,
                     maybe_prefix)
 
+logger = init_logger(__name__)
+
 
 class Qwen2MoeMLP(nn.Module):
 
@@ -524,7 +526,7 @@ def load_weights(self, weights: Iterable[Tuple[str,
                         remapped_kv_scale_name = name.replace(
                             ".kv_scale", ".attn.kv_scale")
                         if remapped_kv_scale_name not in params_dict:
-                            print_warning_once(
+                            logger.warning_once(
                                 "Found kv scale in the checkpoint "
                                 f"(e.g. {name}), but not found the expected "
                                 f"name in the model "
diff --git a/vllm/model_executor/models/vision.py b/vllm/model_executor/models/vision.py
index e6a9e153d9107..a1395982af44c 100644
--- a/vllm/model_executor/models/vision.py
+++ b/vllm/model_executor/models/vision.py
@@ -7,8 +7,10 @@
 import vllm.envs as envs
 from vllm.attention.selector import (backend_name_to_enum,
                                      get_global_forced_attn_backend)
+from vllm.logger import init_logger
 from vllm.platforms import _Backend, current_platform
-from vllm.utils import print_warning_once
+
+logger = init_logger(__name__)
 
 _C = TypeVar("_C", bound=PretrainedConfig)
 
@@ -87,7 +89,7 @@ def get_vit_attn_backend(support_fa: bool = False) -> _Backend:
             if is_flash_attn_2_available():
                 selected_backend = _Backend.FLASH_ATTN
             else:
-                print_warning_once(
+                logger.warning_once(
                     "Current `vllm-flash-attn` has a bug inside vision module, "
                     "so we use xformers backend instead. You can run "
                     "`pip install flash-attn` to use flash-attention backend.")
diff --git a/vllm/utils.py b/vllm/utils.py
index c09cae70e9af8..a92b77efd9fd8 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -696,18 +696,6 @@ def create_kv_caches_with_random(
     return key_caches, value_caches
 
 
-@lru_cache
-def print_info_once(msg: str) -> None:
-    # Set the stacklevel to 2 to print the caller's line info
-    logger.info(msg, stacklevel=2)
-
-
-@lru_cache
-def print_warning_once(msg: str) -> None:
-    # Set the stacklevel to 2 to print the caller's line info
-    logger.warning(msg, stacklevel=2)
-
-
 @lru_cache(maxsize=None)
 def is_pin_memory_available() -> bool:
     from vllm.platforms import current_platform

From a732900efc4eb0d4393e3885d5df8ef3516d4834 Mon Sep 17 00:00:00 2001
From: Guspan Tanadi <36249910+guspan-tanadi@users.noreply.github.com>
Date: Thu, 9 Jan 2025 12:39:39 +0700
Subject: [PATCH 053/115] [Doc] Intended links Python multiprocessing library
 (#11878)

---
 docs/source/design/multiprocessing.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/design/multiprocessing.md b/docs/source/design/multiprocessing.md
index da87638e5b743..c2cdb75ea08a7 100644
--- a/docs/source/design/multiprocessing.md
+++ b/docs/source/design/multiprocessing.md
@@ -21,7 +21,7 @@ This document describes how vLLM deals with these challenges.
 
 ## Multiprocessing Methods
 
-[Python multiprocessing methods](https://docs.python.org/3/library/multiprocessing.html.md#contexts-and-start-methods) include:
+[Python multiprocessing methods](https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods) include:
 
 - `spawn` - spawn a new Python process. This will be the default as of Python
   3.14.

From 310aca88c984983189a57f1b72e3b1dde89fb92f Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Thu, 9 Jan 2025 15:18:21 +0800
Subject: [PATCH 054/115] [perf]fix current stream (#11870)

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 .../device_communicators/pynccl.py            | 15 +++++----
 vllm/distributed/parallel_state.py            |  5 +--
 vllm/utils.py                                 | 33 +++++++++++++++++++
 vllm/worker/multi_step_model_runner.py        |  8 ++---
 4 files changed, 46 insertions(+), 15 deletions(-)

diff --git a/vllm/distributed/device_communicators/pynccl.py b/vllm/distributed/device_communicators/pynccl.py
index fda4d007ceb5b..efc59987195f5 100644
--- a/vllm/distributed/device_communicators/pynccl.py
+++ b/vllm/distributed/device_communicators/pynccl.py
@@ -10,6 +10,7 @@
     ncclRedOpTypeEnum, ncclUniqueId)
 from vllm.distributed.utils import StatelessProcessGroup
 from vllm.logger import init_logger
+from vllm.utils import current_stream
 
 logger = init_logger(__name__)
 
@@ -96,7 +97,7 @@ def __init__(
             self.comm: ncclComm_t = self.nccl.ncclCommInitRank(
                 self.world_size, self.unique_id, self.rank)
 
-            stream = torch.cuda.current_stream()
+            stream = current_stream()
             # A small all_reduce for warmup.
             data = torch.zeros(1, device=device)
             self.all_reduce(data)
@@ -119,7 +120,7 @@ def all_reduce(self,
         out_tensor = torch.empty_like(in_tensor)
 
         if stream is None:
-            stream = torch.cuda.current_stream()
+            stream = current_stream()
         self.nccl.ncclAllReduce(buffer_type(in_tensor.data_ptr()),
                                 buffer_type(out_tensor.data_ptr()),
                                 in_tensor.numel(),
@@ -141,7 +142,7 @@ def all_gather(self,
             f"this nccl communicator is created to work on {self.device}, "
             f"but the input tensor is on {input_tensor.device}")
         if stream is None:
-            stream = torch.cuda.current_stream()
+            stream = current_stream()
         self.nccl.ncclAllGather(
             buffer_type(input_tensor.data_ptr()),
             buffer_type(output_tensor.data_ptr()), input_tensor.numel(),
@@ -162,7 +163,7 @@ def reduce_scatter(self,
             f"this nccl communicator is created to work on {self.device}, "
             f"but the input tensor is on {input_tensor.device}")
         if stream is None:
-            stream = torch.cuda.current_stream()
+            stream = current_stream()
         self.nccl.ncclReduceScatter(
             buffer_type(input_tensor.data_ptr()),
             buffer_type(output_tensor.data_ptr()), output_tensor.numel(),
@@ -177,7 +178,7 @@ def send(self, tensor: torch.Tensor, dst: int, stream=None):
             f"this nccl communicator is created to work on {self.device}, "
             f"but the input tensor is on {tensor.device}")
         if stream is None:
-            stream = torch.cuda.current_stream()
+            stream = current_stream()
         self.nccl.ncclSend(buffer_type(tensor.data_ptr()), tensor.numel(),
                            ncclDataTypeEnum.from_torch(tensor.dtype), dst,
                            self.comm, cudaStream_t(stream.cuda_stream))
@@ -189,7 +190,7 @@ def recv(self, tensor: torch.Tensor, src: int, stream=None):
             f"this nccl communicator is created to work on {self.device}, "
             f"but the input tensor is on {tensor.device}")
         if stream is None:
-            stream = torch.cuda.current_stream()
+            stream = current_stream()
         self.nccl.ncclRecv(buffer_type(tensor.data_ptr()), tensor.numel(),
                            ncclDataTypeEnum.from_torch(tensor.dtype), src,
                            self.comm, cudaStream_t(stream.cuda_stream))
@@ -201,7 +202,7 @@ def broadcast(self, tensor: torch.Tensor, src: int, stream=None):
             f"this nccl communicator is created to work on {self.device}, "
             f"but the input tensor is on {tensor.device}")
         if stream is None:
-            stream = torch.cuda.current_stream()
+            stream = current_stream()
         if src == self.rank:
             sendbuff = buffer_type(tensor.data_ptr())
             # NCCL requires the sender also to have a receive buffer
diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py
index a837c1dc5953b..be7f16ef52a47 100644
--- a/vllm/distributed/parallel_state.py
+++ b/vllm/distributed/parallel_state.py
@@ -357,10 +357,7 @@ def _all_reduce_out_place(self, input_: torch.Tensor) -> torch.Tensor:
             return out
         pynccl_comm = self.pynccl_comm
         assert pynccl_comm is not None
-        # TODO: pynccl should not use `stream=`
-        # it can just always use the current stream.
-        out = pynccl_comm.all_reduce(input_,
-                                     stream=torch.cuda.current_stream())
+        out = pynccl_comm.all_reduce(input_)
         if out is None:
             # fall back to the default all-reduce using PyTorch.
             # this usually happens during testing.
diff --git a/vllm/utils.py b/vllm/utils.py
index a92b77efd9fd8..0b0905e675245 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -944,6 +944,39 @@ def find_nccl_library() -> str:
     return so_file
 
 
+prev_set_stream = torch.cuda.set_stream
+
+_current_stream = None
+
+
+def _patched_set_stream(stream: torch.cuda.Stream) -> None:
+    global _current_stream
+    _current_stream = stream
+    prev_set_stream(stream)
+
+
+torch.cuda.set_stream = _patched_set_stream
+
+
+def current_stream() -> torch.cuda.Stream:
+    """
+    Drop-in replacement for `torch.cuda.current_stream()`.
+    It turns out that `torch.cuda.current_stream()` is quite expensive,
+    as it constructs a new stream object on every call.
+    Here we patch `torch.cuda.set_stream` to keep track of the current stream
+    directly, so that we can avoid calling `torch.cuda.current_stream()`.
+
+    The underlying assumption is that `torch._C._cuda_setStream` is never
+    called from C/C++ code, since such calls would bypass the patch.
+    """
+    global _current_stream
+    if _current_stream is None:
+        # when this function is called before any stream is set,
+        # we return the default stream.
+        _current_stream = torch.cuda.current_stream()
+    return _current_stream
+
+
 def enable_trace_function_call_for_thread(vllm_config: "VllmConfig") -> None:
     """Set up function tracing for the current thread,
     if enabled via the VLLM_TRACE_FUNCTION environment variable
diff --git a/vllm/worker/multi_step_model_runner.py b/vllm/worker/multi_step_model_runner.py
index a2c2cebf8d1f6..acce923498d7e 100644
--- a/vllm/worker/multi_step_model_runner.py
+++ b/vllm/worker/multi_step_model_runner.py
@@ -14,7 +14,7 @@
                                                 get_pythonized_sample_results)
 from vllm.sequence import (CompletionSequenceGroupOutput, IntermediateTensors,
                            Logprob, SequenceGroupMetadata, SequenceOutput)
-from vllm.utils import PyObjectCache, async_tensor_h2d
+from vllm.utils import PyObjectCache, async_tensor_h2d, current_stream
 from vllm.worker.model_runner import (GPUModelRunnerBase,
                                       ModelInputForGPUWithSamplingMetadata)
 from vllm.worker.model_runner_base import (
@@ -498,7 +498,7 @@ def execute_model(
         #   appended sampler output from last iteration
         #   - also maybe pythonize if CPU is ahead of GPU
 
-        current_stream = torch.cuda.current_stream()
+        stream = current_stream()
         if not model_input.is_first_multi_step:
             # Explicitly block on the previous step's forward to make sure we
             # don't clobber any GPU tensors still in use.
@@ -541,7 +541,7 @@ def execute_model(
                                                        num_steps=1)
 
         # record the event for the current step so that the next step can sync
-        model_input.record_step_event(current_stream)
+        model_input.record_step_event(stream)
 
         if get_pp_group().is_last_rank and self.is_driver_worker:
             assert isinstance(output, list)
@@ -552,7 +552,7 @@ def execute_model(
             # event for the pythonization so that we only pythonize if the
             # tensors are ready. May be able to be combined with the step event
             output_ready_event = torch.cuda.Event()
-            output_ready_event.record(current_stream)
+            output_ready_event.record(stream)
             if self.parallel_config.pipeline_parallel_size > 1:
                 output[0].sampled_token_ids_cpu = output[
                     0].sampled_token_ids.cpu()

From 0bd1ff43469f867f92786a3596c3e4a64df43400 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Thu, 9 Jan 2025 17:02:53 +0800
Subject: [PATCH 055/115] [Bugfix] Override dunder methods of placeholder
 modules (#11882)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 tests/test_utils.py |  47 ++++++++++-
 vllm/utils.py       | 189 +++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 220 insertions(+), 16 deletions(-)

diff --git a/tests/test_utils.py b/tests/test_utils.py
index 0285b00d73be1..14d2fbd63b90d 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -7,9 +7,9 @@
 import torch
 from vllm_test_utils import monitor
 
-from vllm.utils import (FlexibleArgumentParser, StoreBoolean, deprecate_kwargs,
-                        get_open_port, memory_profiling, merge_async_iterators,
-                        supports_kw)
+from vllm.utils import (FlexibleArgumentParser, PlaceholderModule,
+                        StoreBoolean, deprecate_kwargs, get_open_port,
+                        memory_profiling, merge_async_iterators, supports_kw)
 
 from .utils import error_on_warning, fork_new_process_for_each_test
 
@@ -323,3 +323,44 @@ def measure_current_non_torch():
     del weights
     lib.cudaFree(handle1)
     lib.cudaFree(handle2)
+
+
+def test_placeholder_module_error_handling():
+    placeholder = PlaceholderModule("placeholder_1234")
+
+    def build_ctx():
+        return pytest.raises(ModuleNotFoundError,
+                             match="No module named")
+
+    with build_ctx():
+        int(placeholder)
+
+    with build_ctx():
+        placeholder()
+
+    with build_ctx():
+        _ = placeholder.some_attr
+
+    with build_ctx():
+        # Test conflict with internal __name attribute
+        _ = placeholder.name
+
+    # OK to print the placeholder or use it in an f-string
+    _ = repr(placeholder)
+    _ = str(placeholder)
+
+    # No error yet; only error when it is used downstream
+    placeholder_attr = placeholder.placeholder_attr("attr")
+
+    with build_ctx():
+        int(placeholder_attr)
+
+    with build_ctx():
+        placeholder_attr()
+
+    with build_ctx():
+        _ = placeholder_attr.some_attr
+
+    with build_ctx():
+        # Test conflict with internal __module attribute
+        _ = placeholder_attr.module
diff --git a/vllm/utils.py b/vllm/utils.py
index 0b0905e675245..487088591ebc2 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -46,7 +46,7 @@
 import zmq.asyncio
 from packaging.version import Version
 from torch.library import Library
-from typing_extensions import ParamSpec, TypeIs, assert_never
+from typing_extensions import Never, ParamSpec, TypeIs, assert_never
 
 import vllm.envs as envs
 from vllm.logger import enable_trace_function_call, init_logger
@@ -1627,24 +1627,183 @@ def get_vllm_optional_dependencies():
     }
 
 
-@dataclass(frozen=True)
-class PlaceholderModule:
+class _PlaceholderBase:
+    """
+    Disallows downstream usage of placeholder modules.
+
+    We need to explicitly override each dunder method because
+    :meth:`__getattr__` is not called when they are accessed.
+
+    See also:
+        [Special method lookup](https://docs.python.org/3/reference/datamodel.html#special-lookup)
+    """
+
+    def __getattr__(self, key: str) -> Never:
+        """
+        The main class should implement this to throw an error
+        for attribute accesses representing downstream usage.
+        """
+        raise NotImplementedError
+
+    # [Basic customization]
+
+    def __lt__(self, other: object):
+        return self.__getattr__("__lt__")
+
+    def __le__(self, other: object):
+        return self.__getattr__("__le__")
+
+    def __eq__(self, other: object):
+        return self.__getattr__("__eq__")
+
+    def __ne__(self, other: object):
+        return self.__getattr__("__ne__")
+
+    def __gt__(self, other: object):
+        return self.__getattr__("__gt__")
+
+    def __ge__(self, other: object):
+        return self.__getattr__("__ge__")
+
+    def __hash__(self):
+        return self.__getattr__("__hash__")
+
+    def __bool__(self):
+        return self.__getattr__("__bool__")
+
+    # [Callable objects]
+
+    def __call__(self, *args: object, **kwargs: object):
+        return self.__getattr__("__call__")
+
+    # [Container types]
+
+    def __len__(self):
+        return self.__getattr__("__len__")
+
+    def __getitem__(self, key: object):
+        return self.__getattr__("__getitem__")
+
+    def __setitem__(self, key: object, value: object):
+        return self.__getattr__("__setitem__")
+
+    def __delitem__(self, key: object):
+        return self.__getattr__("__delitem__")
+
+    # __missing__ is optional according to __getitem__ specification,
+    # so it is skipped
+
+    # __iter__ and __reversed__ have a default implementation
+    # based on __len__ and __getitem__, so they are skipped.
+
+    # [Numeric Types]
+
+    def __add__(self, other: object):
+        return self.__getattr__("__add__")
+
+    def __sub__(self, other: object):
+        return self.__getattr__("__sub__")
+
+    def __mul__(self, other: object):
+        return self.__getattr__("__mul__")
+
+    def __matmul__(self, other: object):
+        return self.__getattr__("__matmul__")
+
+    def __truediv__(self, other: object):
+        return self.__getattr__("__truediv__")
+
+    def __floordiv__(self, other: object):
+        return self.__getattr__("__floordiv__")
+
+    def __mod__(self, other: object):
+        return self.__getattr__("__mod__")
+
+    def __divmod__(self, other: object):
+        return self.__getattr__("__divmod__")
+
+    def __pow__(self, other: object, modulo: object = ...):
+        return self.__getattr__("__pow__")
+
+    def __lshift__(self, other: object):
+        return self.__getattr__("__lshift__")
+
+    def __rshift__(self, other: object):
+        return self.__getattr__("__rshift__")
+
+    def __and__(self, other: object):
+        return self.__getattr__("__and__")
+
+    def __xor__(self, other: object):
+        return self.__getattr__("__xor__")
+
+    def __or__(self, other: object):
+        return self.__getattr__("__or__")
+
+    # i* methods fall back to the methods above when undefined, and r* methods
+    # are only tried for a right-hand placeholder, so they are skipped
+
+    def __neg__(self):
+        return self.__getattr__("__neg__")
+
+    def __pos__(self):
+        return self.__getattr__("__pos__")
+
+    def __abs__(self):
+        return self.__getattr__("__abs__")
+
+    def __invert__(self):
+        return self.__getattr__("__invert__")
+
+    # __complex__, __int__ and __float__ have a default implementation
+    # based on __index__, so they are skipped.
+
+    def __index__(self):
+        return self.__getattr__("__index__")
+
+    def __round__(self, ndigits: object = ...):
+        return self.__getattr__("__round__")
+
+    def __trunc__(self):
+        return self.__getattr__("__trunc__")
+
+    def __floor__(self):
+        return self.__getattr__("__floor__")
+
+    def __ceil__(self):
+        return self.__getattr__("__ceil__")
+
+    # [Context managers]
+
+    def __enter__(self):
+        return self.__getattr__("__enter__")
+
+    def __exit__(self, *args: object, **kwargs: object):
+        return self.__getattr__("__exit__")
+
+
+class PlaceholderModule(_PlaceholderBase):
     """
     A placeholder object to use when a module does not exist.
 
     This enables more informative errors when trying to access attributes
     of a module that does not exists.
     """
-    name: str
+
+    def __init__(self, name: str) -> None:
+        super().__init__()
+
+        # Apply name mangling to avoid conflicting with module attributes
+        self.__name = name
 
     def placeholder_attr(self, attr_path: str):
         return _PlaceholderModuleAttr(self, attr_path)
 
     def __getattr__(self, key: str):
-        name = self.name
+        name = self.__name
 
         try:
-            importlib.import_module(self.name)
+            importlib.import_module(name)
         except ImportError as exc:
             for extra, names in get_vllm_optional_dependencies().items():
                 if name in names:
@@ -1657,17 +1816,21 @@ def __getattr__(self, key: str):
                              "when the original module can be imported")
 
 
-@dataclass(frozen=True)
-class _PlaceholderModuleAttr:
-    module: PlaceholderModule
-    attr_path: str
+class _PlaceholderModuleAttr(_PlaceholderBase):
+
+    def __init__(self, module: PlaceholderModule, attr_path: str) -> None:
+        super().__init__()
+
+        # Apply name mangling to avoid conflicting with module attributes
+        self.__module = module
+        self.__attr_path = attr_path
 
     def placeholder_attr(self, attr_path: str):
-        return _PlaceholderModuleAttr(self.module,
-                                      f"{self.attr_path}.{attr_path}")
+        return _PlaceholderModuleAttr(self.__module,
+                                      f"{self.__attr_path}.{attr_path}")
 
     def __getattr__(self, key: str):
-        getattr(self.module, f"{self.attr_path}.{key}")
+        getattr(self.__module, f"{self.__attr_path}.{key}")
 
         raise AssertionError("PlaceholderModule should not be used "
                              "when the original module can be imported")

From 1d967acb45d5d18434409b822f105f087e379eee Mon Sep 17 00:00:00 2001
From: "Ye (Charlotte) Qi" <ye.charlotte.qi@gmail.com>
Date: Thu, 9 Jan 2025 01:36:39 -0800
Subject: [PATCH 056/115] [Bugfix] fix beam search input errors and latency
 benchmark script (#11875)

Signed-off-by: Ye Qi <yeq@meta.com>
Co-authored-by: yeq <yeq@devgpu004.lla3.facebook.com>
---
 benchmarks/benchmark_latency.py | 23 +++++++++++++++++------
 vllm/entrypoints/llm.py         | 10 ++++++----
 2 files changed, 23 insertions(+), 10 deletions(-)

diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py
index e669ce4db299d..77c4f6aa927e4 100644
--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
@@ -13,6 +13,7 @@
 from vllm import LLM, SamplingParams
 from vllm.engine.arg_utils import EngineArgs
 from vllm.inputs import PromptType
+from vllm.sampling_params import BeamSearchParams
 from vllm.utils import FlexibleArgumentParser
 
 
@@ -40,6 +41,20 @@ def main(args: argparse.Namespace):
         "prompt_token_ids": batch
     } for batch in dummy_prompt_token_ids.tolist()]
 
+    def llm_generate():
+        if not args.use_beam_search:
+            llm.generate(dummy_prompts,
+                         sampling_params=sampling_params,
+                         use_tqdm=False)
+        else:
+            llm.beam_search(
+                dummy_prompts,
+                BeamSearchParams(
+                    beam_width=args.n,
+                    max_tokens=args.output_len,
+                    ignore_eos=True,
+                ))
+
     def run_to_completion(profile_dir: Optional[str] = None):
         if profile_dir:
             with torch.profiler.profile(
@@ -49,15 +64,11 @@ def run_to_completion(profile_dir: Optional[str] = None):
                     ],
                     on_trace_ready=torch.profiler.tensorboard_trace_handler(
                         str(profile_dir))) as p:
-                llm.generate(dummy_prompts,
-                             sampling_params=sampling_params,
-                             use_tqdm=False)
+                llm_generate()
             print(p.key_averages().table(sort_by="self_cuda_time_total"))
         else:
             start_time = time.perf_counter()
-            llm.generate(dummy_prompts,
-                         sampling_params=sampling_params,
-                         use_tqdm=False)
+            llm_generate()
             end_time = time.perf_counter()
             latency = end_time - start_time
             return latency
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index e48fd1a4fa5e9..acb4db85632a8 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -21,7 +21,7 @@
                                          parse_chat_messages,
                                          resolve_chat_template_content_format)
 from vllm.inputs import PromptType, SingletonPrompt, TextPrompt, TokensPrompt
-from vllm.inputs.parse import parse_and_batch_prompt
+from vllm.inputs.parse import is_token_prompt, parse_and_batch_prompt
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.model_executor.guided_decoding.guided_fields import (
@@ -457,7 +457,7 @@ def generate(
 
     def beam_search(
         self,
-        prompts: List[Union[str, List[int]]],
+        prompts: List[Union[TokensPrompt, TextPrompt]],
         params: BeamSearchParams,
     ) -> List[BeamSearchOutput]:
         """
@@ -493,8 +493,10 @@ def sort_beams_key(x: BeamSearchSequence) -> float:
         instances: List[BeamSearchInstance] = []
 
         for prompt in prompts:
-            prompt_tokens = prompt if isinstance(
-                prompt, list) else tokenizer.encode(prompt)
+            if is_token_prompt(prompt):
+                prompt_tokens = prompt["prompt_token_ids"]
+            else:
+                prompt_tokens = tokenizer.encode(prompt["prompt"])
             instances.append(BeamSearchInstance(prompt_tokens))
 
         for _ in range(max_tokens):

From 65097ca0af5c1d7caa3d9d8224fa8b4790a5f7bc Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Thu, 9 Jan 2025 17:43:40 +0800
Subject: [PATCH 057/115] [Doc] Add model development API Reference (#11884)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 .buildkite/test-pipeline.yaml                      |  2 +-
 docs/source/api/{params.md => inference_params.md} |  5 ++---
 docs/source/api/model/adapters.md                  |  9 +++++++++
 docs/source/api/model/index.md                     | 12 ++++++++++++
 docs/source/api/model/interfaces.md                |  9 +++++++++
 docs/source/api/model/interfaces_base.md           |  9 +++++++++
 docs/source/index.md                               |  3 ++-
 vllm/model_executor/models/interfaces.py           | 11 +++++++----
 vllm/model_executor/models/interfaces_base.py      |  3 +++
 9 files changed, 54 insertions(+), 9 deletions(-)
 rename docs/source/api/{params.md => inference_params.md} (79%)
 create mode 100644 docs/source/api/model/adapters.md
 create mode 100644 docs/source/api/model/index.md
 create mode 100644 docs/source/api/model/interfaces.md
 create mode 100644 docs/source/api/model/interfaces_base.md

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index f883595f6d9ad..e288f8f30159a 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -38,7 +38,7 @@ steps:
   - pip install -r requirements-docs.txt
   - SPHINXOPTS=\"-W\" make html
   # Check API reference (if it fails, you may have missing mock imports)
-  - grep \"sig sig-object py\" build/html/api/params.html
+  - grep \"sig sig-object py\" build/html/api/inference_params.html
 
 - label: Async Engine, Inputs, Utils, Worker Test # 24min
   fast_check: true
diff --git a/docs/source/api/params.md b/docs/source/api/inference_params.md
similarity index 79%
rename from docs/source/api/params.md
rename to docs/source/api/inference_params.md
index a3b4d9cbb44ec..181c30cab9c4a 100644
--- a/docs/source/api/params.md
+++ b/docs/source/api/inference_params.md
@@ -1,6 +1,6 @@
-# Optional Parameters
+# Inference Parameters
 
-Optional parameters for vLLM APIs.
+Inference parameters for vLLM APIs.
 
 (sampling-params)=
 
@@ -19,4 +19,3 @@ Optional parameters for vLLM APIs.
 .. autoclass:: vllm.PoolingParams
     :members:
 ```
-
diff --git a/docs/source/api/model/adapters.md b/docs/source/api/model/adapters.md
new file mode 100644
index 0000000000000..e103a51d0070d
--- /dev/null
+++ b/docs/source/api/model/adapters.md
@@ -0,0 +1,9 @@
+# Model Adapters
+
+## Module Contents
+
+```{eval-rst}
+.. automodule:: vllm.model_executor.models.adapters
+    :members:
+    :member-order: bysource
+```
diff --git a/docs/source/api/model/index.md b/docs/source/api/model/index.md
new file mode 100644
index 0000000000000..b8437e3c3517a
--- /dev/null
+++ b/docs/source/api/model/index.md
@@ -0,0 +1,12 @@
+# Model Development
+
+## Submodules
+
+```{toctree}
+:maxdepth: 1
+
+interfaces_base
+interfaces
+adapters
+```
+
diff --git a/docs/source/api/model/interfaces.md b/docs/source/api/model/interfaces.md
new file mode 100644
index 0000000000000..55bee57f64faa
--- /dev/null
+++ b/docs/source/api/model/interfaces.md
@@ -0,0 +1,9 @@
+# Optional Interfaces
+
+## Module Contents
+
+```{eval-rst}
+.. automodule:: vllm.model_executor.models.interfaces
+    :members:
+    :member-order: bysource
+```
diff --git a/docs/source/api/model/interfaces_base.md b/docs/source/api/model/interfaces_base.md
new file mode 100644
index 0000000000000..75d58d34228e9
--- /dev/null
+++ b/docs/source/api/model/interfaces_base.md
@@ -0,0 +1,9 @@
+# Base Model Interfaces
+
+## Module Contents
+
+```{eval-rst}
+.. automodule:: vllm.model_executor.models.interfaces_base
+    :members:
+    :member-order: bysource
+```
diff --git a/docs/source/index.md b/docs/source/index.md
index 6747a7fcce4fe..23e4304fe29d9 100644
--- a/docs/source/index.md
+++ b/docs/source/index.md
@@ -139,8 +139,9 @@ community/sponsors
 
 api/offline_inference/index
 api/engine/index
+api/inference_params
 api/multimodal/index
-api/params
+api/model/index
 ```
 
 % Design Documents: Details about vLLM internals
diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py
index 6f26603046483..b51cba86ec1a4 100644
--- a/vllm/model_executor/models/interfaces.py
+++ b/vllm/model_executor/models/interfaces.py
@@ -38,13 +38,15 @@ def get_multimodal_embeddings(self, **kwargs) -> Optional[T]:
         to be merged with text embeddings.
 
         The output embeddings must be one of the following formats:
+    
         - A list or tuple of 2D tensors, where each tensor corresponds to 
-          each input multimodal data item (e.g, image).
+            each input multimodal data item (e.g., image).
         - A single 3D tensor, with the batch dimension grouping the 2D tensors.
 
-        NOTE: The returned multimodal embeddings must be in the same order as 
-        the appearances of their corresponding multimodal data item in the 
-        input prompt.
+        Note:
+            The returned multimodal embeddings must be in the same order as 
+            the appearances of their corresponding multimodal data item in the 
+            input prompt.
         """
         ...
 
@@ -59,6 +61,7 @@ def get_input_embeddings(
     ) -> torch.Tensor:
         ...
 
+    @overload
     def get_input_embeddings(
         self,
         input_ids: torch.Tensor,
diff --git a/vllm/model_executor/models/interfaces_base.py b/vllm/model_executor/models/interfaces_base.py
index de733b6d49a53..4c353ae6ffc13 100644
--- a/vllm/model_executor/models/interfaces_base.py
+++ b/vllm/model_executor/models/interfaces_base.py
@@ -35,6 +35,7 @@
 
 @runtime_checkable
 class VllmModel(Protocol[C_co, T_co]):
+    """The interface required for all models in vLLM."""
 
     def __init__(
         self,
@@ -97,6 +98,7 @@ def is_vllm_model(
 
 @runtime_checkable
 class VllmModelForTextGeneration(VllmModel[C_co, T], Protocol[C_co, T]):
+    """The interface required for all generative models in vLLM."""
 
     def compute_logits(
         self,
@@ -142,6 +144,7 @@ def is_text_generation_model(
 
 @runtime_checkable
 class VllmModelForPooling(VllmModel[C_co, T], Protocol[C_co, T]):
+    """The interface required for all pooling models in vLLM."""
 
     def pooler(
         self,

From 405eb8e3967eb9bd263b3919796cb3b45a2931d3 Mon Sep 17 00:00:00 2001
From: wangxiyuan <wangxiyuan1007@gmail.com>
Date: Thu, 9 Jan 2025 21:46:50 +0800
Subject: [PATCH 058/115] [platform] Allow platform specify attention backend
 (#11609)

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
Signed-off-by: Mengqing Cao <cmq0113@163.com>
Co-authored-by: Mengqing Cao <cmq0113@163.com>
---
 tests/kernels/test_attention_selector.py |  74 ++++++------
 vllm/attention/selector.py               | 139 ++---------------------
 vllm/platforms/cpu.py                    |   7 +-
 vllm/platforms/cuda.py                   |  77 ++++++++++++-
 vllm/platforms/hpu.py                    |   7 +-
 vllm/platforms/interface.py              |   8 +-
 vllm/platforms/openvino.py               |   7 +-
 vllm/platforms/rocm.py                   |   6 +-
 vllm/platforms/tpu.py                    |   7 +-
 vllm/platforms/xpu.py                    |   7 +-
 10 files changed, 164 insertions(+), 175 deletions(-)

diff --git a/tests/kernels/test_attention_selector.py b/tests/kernels/test_attention_selector.py
index 916cc2efa3895..a08c874407e3f 100644
--- a/tests/kernels/test_attention_selector.py
+++ b/tests/kernels/test_attention_selector.py
@@ -1,10 +1,10 @@
-from unittest.mock import patch
+from unittest.mock import Mock, patch
 
 import pytest
 import torch
 
 from tests.kernels.utils import override_backend_env_variable
-from vllm.attention.selector import which_attn_to_use
+from vllm.attention.selector import _cached_get_attn_backend, get_attn_backend
 from vllm.platforms.cpu import CpuPlatform
 from vllm.platforms.cuda import CudaPlatform
 from vllm.platforms.openvino import OpenVinoPlatform
@@ -12,6 +12,13 @@
 from vllm.utils import STR_FLASH_ATTN_VAL, STR_INVALID_VAL
 
 
+@pytest.fixture(autouse=True)
+def clear_cache():
+    """Clear lru cache to ensure each test case runs without caching.
+    """
+    _cached_get_attn_backend.cache_clear()
+
+
 @pytest.mark.parametrize(
     "name", ["TORCH_SDPA", "ROCM_FLASH", "XFORMERS", "FLASHINFER", "OPENVINO"])
 @pytest.mark.parametrize("device", ["cpu", "openvino", "hip", "cuda"])
@@ -24,67 +31,70 @@ def test_env(name: str, device: str, monkeypatch):
 
     if device == "cpu":
         with patch("vllm.attention.selector.current_platform", CpuPlatform()):
-            backend = which_attn_to_use(16, torch.float16, torch.float16, 16,
-                                        False)
-        assert backend.name == "TORCH_SDPA"
+            backend = get_attn_backend(16, torch.float16, torch.float16, 16,
+                                       False)
+        assert backend.get_name() == "TORCH_SDPA"
     elif device == "hip":
         with patch("vllm.attention.selector.current_platform", RocmPlatform()):
-            backend = which_attn_to_use(16, torch.float16, torch.float16, 16,
-                                        False)
-        assert backend.name == "ROCM_FLASH"
+            backend = get_attn_backend(16, torch.float16, torch.float16, 16,
+                                       False)
+        assert backend.get_name() == "ROCM_FLASH"
     elif device == "openvino":
         with patch("vllm.attention.selector.current_platform",
-                   OpenVinoPlatform()):
-            backend = which_attn_to_use(16, torch.float16, torch.float16, 16,
-                                        False)
-        assert backend.name == "OPENVINO"
+                   OpenVinoPlatform()), patch.dict('sys.modules',
+                                                   {'openvino': Mock()}):
+            backend = get_attn_backend(16, torch.float16, torch.float16, 16,
+                                       False)
+        assert backend.get_name() == "OPENVINO"
     else:
-        with patch("vllm.attention.selector.current_platform", CudaPlatform()):
-            backend = which_attn_to_use(16, torch.float16, torch.float16, 16,
-                                        False)
-        assert backend.name == name
+        if name in ["XFORMERS", "FLASHINFER"]:
+            with patch("vllm.attention.selector.current_platform",
+                       CudaPlatform()):
+                backend = get_attn_backend(16, torch.float16, torch.float16,
+                                           16, False)
+            assert backend.get_name() == name
 
 
 def test_flash_attn(monkeypatch):
     """Test FlashAttn validation."""
     # TODO: When testing for v1, pipe in `use_v1` as an argument to
-    # which_attn_to_use
+    # get_attn_backend
 
     override_backend_env_variable(monkeypatch, STR_FLASH_ATTN_VAL)
 
     # Unsupported CUDA arch
     with patch("torch.cuda.get_device_capability", return_value=(7, 5)):
-        backend = which_attn_to_use(16, torch.float16, None, 16, False)
-        assert backend.name != STR_FLASH_ATTN_VAL
+        backend = get_attn_backend(16, torch.float16, None, 16, False)
+        assert backend.get_name() != STR_FLASH_ATTN_VAL
 
     # Unsupported data type
-    backend = which_attn_to_use(16, torch.float8_e4m3fn, None, 16, False)
-    assert backend.name != STR_FLASH_ATTN_VAL
+    backend = get_attn_backend(16, torch.float8_e4m3fn, None, 16, False)
+    assert backend.get_name() != STR_FLASH_ATTN_VAL
 
     # Unsupported kv cache data type
-    backend = which_attn_to_use(16, torch.float16, "fp8", 16, False)
-    assert backend.name != STR_FLASH_ATTN_VAL
+    backend = get_attn_backend(16, torch.float16, "fp8", 16, False)
+    assert backend.get_name() != STR_FLASH_ATTN_VAL
 
     # Unsupported block size
-    backend = which_attn_to_use(16, torch.float16, None, 8, False)
-    assert backend.name != STR_FLASH_ATTN_VAL
+    backend = get_attn_backend(16, torch.float16, None, 8, False)
+    assert backend.get_name() != STR_FLASH_ATTN_VAL
 
     # flash-attn is not installed
     with patch.dict('sys.modules', {'vllm_flash_attn': None}):
-        backend = which_attn_to_use(16, torch.float16, None, 16, False)
-        assert backend.name != STR_FLASH_ATTN_VAL
+        backend = get_attn_backend(16, torch.float16, None, 16, False)
+        assert backend.get_name() != STR_FLASH_ATTN_VAL
 
     # Unsupported head size
-    backend = which_attn_to_use(17, torch.float16, None, 16, False)
-    assert backend.name != STR_FLASH_ATTN_VAL
+    backend = get_attn_backend(17, torch.float16, None, 16, False)
+    assert backend.get_name() != STR_FLASH_ATTN_VAL
 
     # Attention-free models should bypass env and use PlaceholderAttention
-    backend = which_attn_to_use(16, torch.float16, torch.float16, 16, True)
-    assert backend.name != STR_FLASH_ATTN_VAL
+    backend = get_attn_backend(16, torch.float16, torch.float16, 16, True)
+    assert backend.get_name() != STR_FLASH_ATTN_VAL
 
 
 def test_invalid_env(monkeypatch):
     """Throw an exception if the backend name is invalid."""
     override_backend_env_variable(monkeypatch, STR_INVALID_VAL)
     with pytest.raises(ValueError):
-        which_attn_to_use(16, torch.float16, None, 16, False)
+        get_attn_backend(16, torch.float16, None, 16, False)
diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py
index d263839705690..0ff007c87b1c9 100644
--- a/vllm/attention/selector.py
+++ b/vllm/attention/selector.py
@@ -9,7 +9,7 @@
 from vllm.attention.backends.abstract import AttentionBackend
 from vllm.logger import init_logger
 from vllm.platforms import _Backend, current_platform
-from vllm.utils import STR_BACKEND_ENV_VAR
+from vllm.utils import STR_BACKEND_ENV_VAR, resolve_obj_by_qualname
 
 logger = init_logger(__name__)
 
@@ -114,83 +114,19 @@ def _cached_get_attn_backend(
             BlocksparseFlashAttentionBackend)
         return BlocksparseFlashAttentionBackend
 
-    backend = which_attn_to_use(head_size, dtype, kv_cache_dtype, block_size,
-                                is_attention_free, use_v1)
-    if backend == _Backend.FLASH_ATTN:
-        logger.info("Using Flash Attention backend.")
-        from vllm.attention.backends.flash_attn import (  # noqa: F401
-            FlashAttentionBackend)
-        return FlashAttentionBackend
-    if backend == _Backend.FLASH_ATTN_VLLM_V1:
-        from vllm.v1.attention.backends.flash_attn import (  # noqa: F401
-            FlashAttentionBackend as FlashAttentionBackendV1)
-        return FlashAttentionBackendV1
-    if backend == _Backend.XFORMERS:
-        logger.info("Using XFormers backend.")
-        from vllm.attention.backends.xformers import (  # noqa: F401
-            XFormersBackend)
-        return XFormersBackend
-    elif backend == _Backend.ROCM_FLASH:
-        logger.info("Using ROCmFlashAttention backend.")
-        from vllm.attention.backends.rocm_flash_attn import (  # noqa: F401
-            ROCmFlashAttentionBackend)
-        return ROCmFlashAttentionBackend
-    elif backend == _Backend.TORCH_SDPA:
-        assert current_platform.is_cpu(), RuntimeError(
-            "Torch SDPA backend is only used for the CPU device.")
-        logger.info("Using Torch SDPA backend.")
-        from vllm.attention.backends.torch_sdpa import TorchSDPABackend
-        return TorchSDPABackend
-    elif backend == _Backend.OPENVINO:
-        logger.info("Using OpenVINO Attention backend.")
-        from vllm.attention.backends.openvino import OpenVINOAttentionBackend
-        return OpenVINOAttentionBackend
-    elif backend == _Backend.IPEX:
-        assert current_platform.is_xpu(), RuntimeError(
-            "IPEX attention backend is only used for the XPU device.")
-        logger.info("Using IPEX attention backend.")
-        from vllm.attention.backends.ipex_attn import IpexAttnBackend
-        return IpexAttnBackend
-    elif backend == _Backend.FLASHINFER:
-        logger.info("Using Flashinfer backend.")
-        from vllm.attention.backends.flashinfer import FlashInferBackend
-        return FlashInferBackend
-    elif backend == _Backend.HPU_ATTN:
-        logger.info("Using HPUAttention backend.")
-        from vllm.attention.backends.hpu_attn import HPUAttentionBackend
-        return HPUAttentionBackend
-    elif backend == _Backend.PALLAS:
-        logger.info("Using Pallas backend.")
-        from vllm.attention.backends.pallas import PallasAttentionBackend
-        return PallasAttentionBackend
-    elif backend == _Backend.NO_ATTENTION:
-        from vllm.attention.backends.placeholder_attn import (
-            PlaceholderAttentionBackend)
-        return PlaceholderAttentionBackend
-    else:
-        raise ValueError("Invalid attention backend.")
-
-
-def which_attn_to_use(head_size: int,
-                      dtype: torch.dtype,
-                      kv_cache_dtype: Optional[str],
-                      block_size: int,
-                      is_attention_free: bool,
-                      use_v1: bool = False) -> _Backend:
-    """Returns which flash attention backend to use."""
-    # Default case.
-    selected_backend = _Backend.FLASH_ATTN
-
     # If there are no attention layers (e.g. we are running Mamba),
     # use the placeholder NO_ATTENTION
     if is_attention_free:
-        return _Backend.NO_ATTENTION
+        from vllm.attention.backends.placeholder_attn import (
+            PlaceholderAttentionBackend)
+        return PlaceholderAttentionBackend
 
     # Check whether a particular choice of backend was
     # previously forced.
     #
     # THIS SELECTION OVERRIDES THE VLLM_ATTENTION_BACKEND
     # ENVIRONMENT VARIABLE.
+    selected_backend = None
     backend_by_global_setting: Optional[_Backend] = (
         get_global_forced_attn_backend())
     if backend_by_global_setting is not None:
@@ -201,64 +137,13 @@ def which_attn_to_use(head_size: int,
         if backend_by_env_var is not None:
             selected_backend = backend_name_to_enum(backend_by_env_var)
 
-    # get device-specific default attn_backend
-    default_backend = current_platform.get_default_attn_backend(
-        selected_backend)
-    if default_backend is not None:
-        return default_backend
-
-    if use_v1:
-        return _Backend.FLASH_ATTN_VLLM_V1
-
-    # FlashAttn in NVIDIA GPUs.
-    if selected_backend == _Backend.FLASH_ATTN:
-        if not current_platform.has_device_capability(80):
-            # Volta and Turing NVIDIA GPUs.
-            logger.info(
-                "Cannot use FlashAttention-2 backend for Volta and Turing "
-                "GPUs.")
-            selected_backend = _Backend.XFORMERS
-        elif dtype not in (torch.float16, torch.bfloat16):
-            logger.info(
-                "Cannot use FlashAttention-2 backend for dtype other than "
-                "torch.float16 or torch.bfloat16.")
-            selected_backend = _Backend.XFORMERS
-        elif kv_cache_dtype is not None and kv_cache_dtype.startswith("fp8"):
-            logger.info(
-                "Cannot use FlashAttention-2 backend for FP8 KV cache.")
-            logger.warning(
-                "Please use FlashInfer backend with FP8 KV Cache for "
-                "better performance by setting environment variable  "
-                "VLLM_ATTENTION_BACKEND=FLASHINFER")
-            selected_backend = _Backend.XFORMERS
-        elif block_size % 16 != 0:
-            logger.info(
-                "Cannot use FlashAttention-2 backend for block size not "
-                "divisible by 16.")
-            selected_backend = _Backend.XFORMERS
-
-    # FlashAttn is valid for the model, checking if the package is installed.
-    if selected_backend == _Backend.FLASH_ATTN:
-        try:
-            import vllm.vllm_flash_attn  # noqa: F401
-            from vllm.attention.backends.flash_attn import (  # noqa: F401
-                FlashAttentionBackend)
-
-            supported_sizes = FlashAttentionBackend.get_supported_head_sizes()
-            if head_size not in supported_sizes:
-                logger.info(
-                    "Cannot use FlashAttention-2 backend for head size %d.",
-                    head_size)
-                selected_backend = _Backend.XFORMERS
-        except ImportError:
-            logger.info(
-                "Cannot use FlashAttention-2 backend because the "
-                "vllm.vllm_flash_attn package is not found. "
-                "Make sure that vllm_flash_attn was built and installed "
-                "(on by default).")
-            selected_backend = _Backend.XFORMERS
-
-    return selected_backend
+    # get device-specific attn_backend
+    attention_cls = current_platform.get_attn_backend_cls(
+        selected_backend, head_size, dtype, kv_cache_dtype, block_size, use_v1)
+    if not attention_cls:
+        raise ValueError(
+            f"Invalid attention backend for {current_platform.device_name}")
+    return resolve_obj_by_qualname(attention_cls)
 
 
 @contextmanager
diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py
index 7ba7f5150150c..eb3e269cac285 100644
--- a/vllm/platforms/cpu.py
+++ b/vllm/platforms/cpu.py
@@ -28,10 +28,20 @@ def get_device_name(cls, device_id: int = 0) -> str:
         return "cpu"
 
     @classmethod
-    def get_default_attn_backend(cls, selected_backend: _Backend) -> _Backend:
+    def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int,
+                             dtype: torch.dtype, kv_cache_dtype: Optional[str],
+                             block_size: int, use_v1: bool) -> str:
+        """Return the qualified class name of the CPU attention backend.
+
+        Torch SDPA is always returned; a different explicitly requested
+        backend is only reported in the log.
+        """
-        if selected_backend != _Backend.TORCH_SDPA:
+        # selected_backend is None when no backend was forced, so only
+        # report a mismatch for an explicit request.
+        if selected_backend not in (None, _Backend.TORCH_SDPA):
             logger.info("Cannot use %s backend on CPU.", selected_backend)
-        return _Backend.TORCH_SDPA
+        logger.info("Using Torch SDPA backend.")
+        return "vllm.attention.backends.torch_sdpa.TorchSDPABackend"
 
     @classmethod
     def get_device_total_memory(cls, device_id: int = 0) -> int:
diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py
index 3c5350b778345..23ceac83e49de 100644
--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -16,7 +16,7 @@
 import vllm.envs as envs
 from vllm.logger import init_logger
 
-from .interface import DeviceCapability, Platform, PlatformEnum
+from .interface import DeviceCapability, Platform, PlatformEnum, _Backend
 
 if TYPE_CHECKING:
     from vllm.config import VllmConfig
@@ -141,6 +141,94 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
         if cache_config and cache_config.block_size is None:
             cache_config.block_size = 16
 
+    @classmethod
+    def get_attn_backend_cls(cls, selected_backend, head_size, dtype,
+                             kv_cache_dtype, block_size, use_v1) -> str:
+        """Select the attention backend class for CUDA devices.
+
+        Returns the fully qualified name of the backend class. The V1
+        engine always gets the V1 FlashAttention backend. Otherwise an
+        explicitly selected FlashInfer or XFormers backend is honored,
+        FlashAttention is the default, and XFormers is the fallback when
+        FlashAttention cannot be used (device capability below 80, dtype
+        other than float16/bfloat16, FP8 KV cache, block size not divisible
+        by 16, unsupported head size, or vllm_flash_attn not importable).
+
+        Raises:
+            ValueError: On the non-V1 path, if any other backend was selected.
+        """
+        if use_v1:
+            logger.info("Using Flash Attention backend on V1 engine.")
+            return "vllm.v1.attention.backends.flash_attn.FlashAttentionBackend"
+        if selected_backend == _Backend.FLASHINFER:
+            logger.info("Using FlashInfer backend.")
+            return "vllm.attention.backends.flashinfer.FlashInferBackend"
+        elif selected_backend == _Backend.XFORMERS:
+            logger.info("Using XFormers backend.")
+            return "vllm.attention.backends.xformers.XFormersBackend"
+        elif selected_backend == _Backend.FLASH_ATTN:
+            pass
+        elif selected_backend:
+            raise ValueError(
+                f"Invalid attention backend for {cls.device_name}")
+
+        target_backend = _Backend.FLASH_ATTN
+        if not cls.has_device_capability(80):
+            # Volta and Turing NVIDIA GPUs.
+            logger.info(
+                "Cannot use FlashAttention-2 backend for Volta and Turing "
+                "GPUs.")
+            target_backend = _Backend.XFORMERS
+        elif dtype not in (torch.float16, torch.bfloat16):
+            logger.info(
+                "Cannot use FlashAttention-2 backend for dtype other than "
+                "torch.float16 or torch.bfloat16.")
+            target_backend = _Backend.XFORMERS
+        elif kv_cache_dtype is not None and \
+            kv_cache_dtype.startswith("fp8"):
+            logger.info(
+                "Cannot use FlashAttention-2 backend for FP8 KV cache.")
+            logger.warning(
+                "Please use FlashInfer backend with FP8 KV Cache for "
+                "better performance by setting environment variable  "
+                "VLLM_ATTENTION_BACKEND=FLASHINFER")
+            target_backend = _Backend.XFORMERS
+        elif block_size % 16 != 0:
+            logger.info(
+                "Cannot use FlashAttention-2 backend for block size not "
+                "divisible by 16.")
+            target_backend = _Backend.XFORMERS
+
+        # FlashAttn is valid for the model, checking if the package is
+        # installed.
+        if target_backend == _Backend.FLASH_ATTN:
+            try:
+                import vllm.vllm_flash_attn  # noqa: F401
+                from vllm.attention.backends.flash_attn import (  # noqa: F401
+                    FlashAttentionBackend)
+
+                supported_sizes = \
+                    FlashAttentionBackend.get_supported_head_sizes()
+                if head_size not in supported_sizes:
+                    logger.info(
+                        "Cannot use FlashAttention-2 backend for head size %d.",
+                        head_size)
+                    target_backend = _Backend.XFORMERS
+            except ImportError:
+                logger.info(
+                    "Cannot use FlashAttention-2 backend because the "
+                    "vllm.vllm_flash_attn package is not found. "
+                    "Make sure that vllm_flash_attn was built and installed "
+                    "(on by default).")
+                target_backend = _Backend.XFORMERS
+
+        if target_backend == _Backend.XFORMERS:
+            logger.info("Using XFormers backend.")
+            return "vllm.attention.backends.xformers.XFormersBackend"
+
+        logger.info("Using Flash Attention backend.")
+        return "vllm.attention.backends.flash_attn.FlashAttentionBackend"
+
 
 # NVML utils
 # Note that NVML is not affected by `CUDA_VISIBLE_DEVICES`,
diff --git a/vllm/platforms/hpu.py b/vllm/platforms/hpu.py
index 0a44f2b74163a..8152d881fa8d9 100644
--- a/vllm/platforms/hpu.py
+++ b/vllm/platforms/hpu.py
@@ -21,8 +21,11 @@ class HpuPlatform(Platform):
     dispatch_key: str = "HPU"
 
     @classmethod
-    def get_default_attn_backend(cls, selected_backend: _Backend) -> _Backend:
-        return _Backend.HPU_ATTN
+    def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int,
+                             dtype: torch.dtype, kv_cache_dtype: Optional[str],
+                             block_size: int, use_v1: bool) -> str:
+        logger.info("Using HPUAttention backend.")
+        return "vllm.attention.backends.hpu_attn.HPUAttentionBackend"
 
     @classmethod
     def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py
index ddccaa2ce0148..f440358f65fbb 100644
--- a/vllm/platforms/interface.py
+++ b/vllm/platforms/interface.py
@@ -112,9 +112,11 @@ def is_cuda_alike(self) -> bool:
         return self._enum in (PlatformEnum.CUDA, PlatformEnum.ROCM)
 
     @classmethod
-    def get_default_attn_backend(cls, selected_backend: _Backend):
-        """Get the default attention backend of a device."""
-        return None
+    def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int,
+                             dtype: torch.dtype, kv_cache_dtype: Optional[str],
+                             block_size: int, use_v1: bool) -> str:
+        """Get the attention backend class of a device."""
+        return ""
 
     @classmethod
     def get_device_capability(
diff --git a/vllm/platforms/openvino.py b/vllm/platforms/openvino.py
index 16eb8dc81efc2..9390eda535c8f 100644
--- a/vllm/platforms/openvino.py
+++ b/vllm/platforms/openvino.py
@@ -28,10 +28,20 @@ class OpenVinoPlatform(Platform):
     dispatch_key: str = "CPU"
 
     @classmethod
-    def get_default_attn_backend(cls, selected_backend: _Backend) -> _Backend:
+    def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int,
+                             dtype: torch.dtype, kv_cache_dtype: Optional[str],
+                             block_size: int, use_v1: bool) -> str:
+        """Return the qualified class name of the OpenVINO attention backend.
+
+        The OpenVINO backend is always returned; a different explicitly
+        requested backend is only reported in the log.
+        """
-        if selected_backend != _Backend.OPENVINO:
+        # selected_backend is None when no backend was forced, so only
+        # report a mismatch for an explicit request.
+        if selected_backend not in (None, _Backend.OPENVINO):
             logger.info("Cannot use %s backend on OpenVINO.", selected_backend)
-        return _Backend.OPENVINO
+        logger.info("Using OpenVINO Attention backend.")
+        return "vllm.attention.backends.openvino.OpenVINOAttentionBackend"
 
     @classmethod
     def get_device_name(cls, device_id: int = 0) -> str:
diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py
index aa779f265135f..1c2f602efc856 100644
--- a/vllm/platforms/rocm.py
+++ b/vllm/platforms/rocm.py
@@ -70,7 +70,8 @@ class RocmPlatform(Platform):
     ]
 
     @classmethod
-    def get_default_attn_backend(cls, selected_backend: _Backend) -> _Backend:
+    def get_attn_backend_cls(cls, selected_backend, head_size, dtype,
+                             kv_cache_dtype, block_size, use_v1) -> str:
         selected_backend = (_Backend.ROCM_FLASH if selected_backend
                             == _Backend.FLASH_ATTN else selected_backend)
         if selected_backend == _Backend.ROCM_FLASH:
@@ -79,7 +80,8 @@ def get_default_attn_backend(cls, selected_backend: _Backend) -> _Backend:
                 logger.info("flash_attn is not supported on NAVI GPUs.")
         else:
             logger.info("%s is not supported in AMD GPUs.", selected_backend)
-        return _Backend.ROCM_FLASH
+        logger.info("Using ROCmFlashAttention backend.")
+        return "vllm.attention.backends.rocm_flash_attn.ROCmFlashAttentionBackend"  # noqa: E501
 
     @classmethod
     @lru_cache(maxsize=8)
diff --git a/vllm/platforms/tpu.py b/vllm/platforms/tpu.py
index d488daf056f1a..8a59b53ca4b15 100644
--- a/vllm/platforms/tpu.py
+++ b/vllm/platforms/tpu.py
@@ -24,10 +24,20 @@ class TpuPlatform(Platform):
     ]
 
     @classmethod
-    def get_default_attn_backend(cls, selected_backend: _Backend) -> _Backend:
+    def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int,
+                             dtype: torch.dtype, kv_cache_dtype: Optional[str],
+                             block_size: int, use_v1: bool) -> str:
+        """Return the qualified class name of the TPU attention backend.
+
+        The Pallas backend is always returned; a different explicitly
+        requested backend is only reported in the log.
+        """
-        if selected_backend != _Backend.PALLAS:
+        # selected_backend is None when no backend was forced, so only
+        # report a mismatch for an explicit request.
+        if selected_backend not in (None, _Backend.PALLAS):
             logger.info("Cannot use %s backend on TPU.", selected_backend)
-        return _Backend.PALLAS
+        logger.info("Using Pallas backend.")
+        return "vllm.attention.backends.pallas.PallasAttentionBackend"
 
     @classmethod
     def get_device_name(cls, device_id: int = 0) -> str:
diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py
index 78e17c2afec65..00692a5d23031 100644
--- a/vllm/platforms/xpu.py
+++ b/vllm/platforms/xpu.py
@@ -21,10 +21,20 @@ class XPUPlatform(Platform):
     dispatch_key: str = "XPU"
 
     @classmethod
-    def get_default_attn_backend(cls, selected_backend: _Backend) -> _Backend:
+    def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int,
+                             dtype: torch.dtype, kv_cache_dtype: Optional[str],
+                             block_size: int, use_v1: bool) -> str:
+        """Return the qualified class name of the XPU attention backend.
+
+        The IPEX backend is always returned; a different explicitly
+        requested backend is only reported in the log.
+        """
-        if selected_backend != _Backend.IPEX:
+        # selected_backend is None when no backend was forced, so only
+        # report a mismatch for an explicit request.
+        if selected_backend not in (None, _Backend.IPEX):
             logger.info("Cannot use %s backend on XPU.", selected_backend)
-        return _Backend.IPEX
+        logger.info("Using IPEX attention backend.")
+        return "vllm.attention.backends.ipex_attn.IpexAttnBackend"
 
     @staticmethod
     def get_device_capability(device_id: int = 0) -> DeviceCapability:

From bd8287221187279c668ac10c3edd5242b8d8b429 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Thu, 9 Jan 2025 22:47:29 +0800
Subject: [PATCH 059/115] [ci]try to fix flaky multi-step tests (#11894)

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 tests/multi_step/test_correctness_async_llm.py | 3 +--
 tests/utils.py                                 | 9 +++++++--
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/tests/multi_step/test_correctness_async_llm.py b/tests/multi_step/test_correctness_async_llm.py
index 7203d635c2fa8..8456a463adeeb 100644
--- a/tests/multi_step/test_correctness_async_llm.py
+++ b/tests/multi_step/test_correctness_async_llm.py
@@ -16,7 +16,6 @@
 NUM_PROMPTS = [10]
 
 DEFAULT_SERVER_ARGS: List[str] = [
-    "--disable-log-requests",
     "--worker-use-ray",
     "--gpu-memory-utilization",
     "0.85",
@@ -110,7 +109,7 @@ async def test_multi_step(
 
     # Spin up client/server & issue completion API requests.
     # Default `max_wait_seconds` is 240 but was empirically
-    # was raised 3x to 720 *just for this test* due to
+    # raised 5x to 1200 *just for this test* due to
     # observed timeouts in GHA CI
     ref_completions = await completions_with_server_args(
         prompts,
diff --git a/tests/utils.py b/tests/utils.py
index bf3d88194e4ca..f4eecf19e8c64 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -157,13 +157,19 @@ def url_root(self) -> str:
     def url_for(self, *parts: str) -> str:
         return self.url_root + "/" + "/".join(parts)
 
-    def get_client(self):
+    def get_client(self, **kwargs):
+        if "timeout" not in kwargs:
+            kwargs["timeout"] = 600
         return openai.OpenAI(
             base_url=self.url_for("v1"),
             api_key=self.DUMMY_API_KEY,
+            max_retries=0,
+            **kwargs,
         )
 
     def get_async_client(self, **kwargs):
+        if "timeout" not in kwargs:
+            kwargs["timeout"] = 600
         return openai.AsyncOpenAI(base_url=self.url_for("v1"),
                                   api_key=self.DUMMY_API_KEY,
                                   max_retries=0,
@@ -780,7 +786,6 @@ async def completions_with_server_args(
     assert len(max_tokens) == len(prompts)
 
     outputs = None
-    max_wait_seconds = 240 * 3  # 240 is default
     with RemoteOpenAIServer(model_name,
                             server_cli_args,
                             max_wait_seconds=max_wait_seconds) as server:

From 9a228348d2f9a2c85dfc67d6b9fe883bf10a4680 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Fri, 10 Jan 2025 01:19:37 +0800
Subject: [PATCH 060/115] [Misc] Provide correct Pixtral-HF chat template
 (#11891)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 docs/source/models/supported_models.md | 61 ++++++++++++++------------
 examples/template_pixtral_hf.jinja     | 38 ++++++++++++++++
 tests/entrypoints/test_chat_utils.py   |  1 +
 3 files changed, 73 insertions(+), 27 deletions(-)
 create mode 100644 examples/template_pixtral_hf.jinja

diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md
index 3ba34c77205e5..acbe27a22a679 100644
--- a/docs/source/models/supported_models.md
+++ b/docs/source/models/supported_models.md
@@ -322,7 +322,7 @@ See [this page](#generative-models) for more information on how to use generativ
   - ✅︎
   - ✅︎
 * - `Qwen2ForCausalLM`
-  - Qwen2
+  - QwQ, Qwen2
   - `Qwen/QwQ-32B-Preview`, `Qwen/Qwen2-7B-Instruct`, `Qwen/Qwen2-7B`, etc.
   - ✅︎
   - ✅︎
@@ -436,7 +436,7 @@ loaded. See [relevant issue on HF Transformers](https://github.com/huggingface/t
 ```
 
 If your model is not in the above list, we will try to automatically convert the model using
-{func}`vllm.model_executor.models.adapters.as_embedding_model`. By default, the embeddings
+{func}`~vllm.model_executor.models.adapters.as_embedding_model`. By default, the embeddings
 of the whole prompt are extracted from the normalized hidden state corresponding to the last token.
 
 #### Reward Modeling (`--task reward`)
@@ -468,7 +468,7 @@ of the whole prompt are extracted from the normalized hidden state corresponding
 ```
 
 If your model is not in the above list, we will try to automatically convert the model using
-{func}`vllm.model_executor.models.adapters.as_reward_model`. By default, we return the hidden states of each token directly.
+{func}`~vllm.model_executor.models.adapters.as_reward_model`. By default, we return the hidden states of each token directly.
 
 ```{important}
 For process-supervised reward models such as `peiyi9979/math-shepherd-mistral-7b-prm`, the pooling config should be set explicitly,
@@ -499,7 +499,7 @@ e.g.: `--override-pooler-config '{"pooling_type": "STEP", "step_tag_id": 123, "r
 ```
 
 If your model is not in the above list, we will try to automatically convert the model using
-{func}`vllm.model_executor.models.adapters.as_classification_model`. By default, the class probabilities are extracted from the softmaxed hidden state corresponding to the last token.
+{func}`~vllm.model_executor.models.adapters.as_classification_model`. By default, the class probabilities are extracted from the softmaxed hidden state corresponding to the last token.
 
 #### Sentence Pair Scoring (`--task score`)
 
@@ -550,6 +550,28 @@ On the other hand, modalities separated by `/` are mutually exclusive.
 
 See [this page](#multimodal-inputs) on how to pass multi-modal inputs to the model.
 
+````{important}
+To enable multiple multi-modal items per text prompt, you have to set `limit_mm_per_prompt` (offline inference)
+or `--limit-mm-per-prompt` (online inference). For example, to enable passing up to 4 images per text prompt:
+
+Offline inference:
+```python
+llm = LLM(
+    model="Qwen/Qwen2-VL-7B-Instruct",
+    limit_mm_per_prompt={"image": 4},
+)
+```
+
+Online inference:
+```bash
+vllm serve Qwen/Qwen2-VL-7B-Instruct --limit-mm-per-prompt image=4
+```
+````
+
+```{note}
+vLLM currently only supports adding LoRA to the language backbone of multimodal models.
+```
+
 ### Generative Models
 
 See [this page](#generative-models) for more information on how to use generative models.
@@ -689,14 +711,14 @@ See [this page](#generative-models) for more information on how to use generativ
 * - `Phi3VForCausalLM`
   - Phi-3-Vision, Phi-3.5-Vision
   - T + I<sup>E+</sup>
-  - `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct` etc.
+  - `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct`, etc.
   -
   - ✅︎
   - ✅︎
 * - `PixtralForConditionalGeneration`
   - Pixtral
   - T + I<sup>+</sup>
-  - `mistralai/Pixtral-12B-2409`, `mistral-community/pixtral-12b` etc.
+  - `mistralai/Pixtral-12B-2409`, `mistral-community/pixtral-12b` (see note), etc.
   -
   - ✅︎
   - ✅︎
@@ -715,7 +737,7 @@ See [this page](#generative-models) for more information on how to use generativ
   - ✅︎
   - ✅︎
 * - `Qwen2VLForConditionalGeneration`
-  - Qwen2-VL
+  - QVQ, Qwen2-VL
   - T + I<sup>E+</sup> + V<sup>E+</sup>
   - `Qwen/QVQ-72B-Preview`, `Qwen/Qwen2-VL-7B-Instruct`, `Qwen/Qwen2-VL-72B-Instruct`, etc.
   - ✅︎
@@ -733,26 +755,6 @@ See [this page](#generative-models) for more information on how to use generativ
 <sup>E</sup> Pre-computed embeddings can be inputted for this modality.  
 <sup>+</sup> Multiple items can be inputted per text prompt for this modality.
 
-````{important}
-To enable multiple multi-modal items per text prompt, you have to set `limit_mm_per_prompt` (offline inference)
-or `--limit-mm-per-prompt` (online inference). For example, to enable passing up to 4 images per text prompt:
-
-```python
-llm = LLM(
-    model="Qwen/Qwen2-VL-7B-Instruct",
-    limit_mm_per_prompt={"image": 4},
-)
-```
-
-```bash
-vllm serve Qwen/Qwen2-VL-7B-Instruct --limit-mm-per-prompt image=4
-```
-````
-
-```{note}
-vLLM currently only supports adding LoRA to the language backbone of multimodal models.
-```
-
 ```{note}
 To use `TIGER-Lab/Mantis-8B-siglip-llama3`, you have pass `--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'` when running vLLM.
 ```
@@ -762,6 +764,11 @@ The official `openbmb/MiniCPM-V-2` doesn't work yet, so we need to use a fork (`
 For more details, please see: <gh-pr:4087#issuecomment-2250397630>
 ```
 
+```{note}
+The chat template for Pixtral-HF is incorrect (see [discussion](https://huggingface.co/mistral-community/pixtral-12b/discussions/22)).
+A corrected version is available at <gh-file:examples/template_pixtral_hf.jinja>.
+```
+
 ### Pooling Models
 
 See [this page](pooling-models) for more information on how to use pooling models.
diff --git a/examples/template_pixtral_hf.jinja b/examples/template_pixtral_hf.jinja
new file mode 100644
index 0000000000000..e94661cb39071
--- /dev/null
+++ b/examples/template_pixtral_hf.jinja
@@ -0,0 +1,38 @@
+{%- if messages[0]["role"] == "system" %}
+    {%- set system_message = messages[0]["content"] %}
+    {%- set loop_messages = messages[1:] %}
+{%- else %}
+    {%- set loop_messages = messages %}
+{%- endif %}
+
+{{- bos_token }}
+{%- for message in loop_messages %}
+    {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}
+        {{- raise_exception('After the optional system message, conversation roles must alternate user/assistant/user/assistant/...') }}
+    {%- endif %}
+    {%- if message["role"] == "user" %}
+        {%- if loop.last and system_message is defined %}
+            {{- "[INST]" + system_message + "\n" }}
+        {%- else %}
+            {{- "[INST]" }}
+        {%- endif %}
+        {%- if message["content"] is not string %}
+            {%- for chunk in message["content"] %}
+                {%- if chunk["type"] == "text" %}
+                    {{- chunk["text"] }}
+                {%- elif chunk["type"] == "image" %}
+                    {{- "[IMG]" }}
+                {%- else %}
+                    {{- raise_exception("Unrecognized content type!") }}
+                {%- endif %}
+            {%- endfor %}
+        {%- else %}
+            {{- message["content"] }}
+        {%- endif %}
+        {{- "[/INST]" }}
+    {%- elif message["role"] == "assistant" %}
+        {{- message["content"] + eos_token}}
+    {%- else %}
+        {{- raise_exception("Only user and assistant roles are supported, with the exception of an initial optional system message!") }}
+    {%- endif %}
+{%- endfor %}
diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py
index d63b963522e73..8f242df4a60e3 100644
--- a/tests/entrypoints/test_chat_utils.py
+++ b/tests/entrypoints/test_chat_utils.py
@@ -758,6 +758,7 @@ def test_resolve_content_format_hf_defined(model, expected_format):
      ("template_falcon.jinja", "string"),
      ("template_inkbot.jinja", "string"),
      ("template_llava.jinja", "string"),
+     ("template_pixtral_hf.jinja", "openai"),
      ("template_vlm2vec.jinja", "openai"),
      ("tool_chat_template_granite_20b_fc.jinja", "string"),
      ("tool_chat_template_hermes.jinja", "string"),

From 36f5303578397d122693a19007be38ba2f02bcbc Mon Sep 17 00:00:00 2001
From: Charles Frye <cfrye59@gmail.com>
Date: Thu, 9 Jan 2025 15:26:37 -0800
Subject: [PATCH 061/115] [Docs] Add Modal to deployment frameworks (#11907)

---
 docs/source/deployment/frameworks/bentoml.md | 2 +-
 docs/source/deployment/frameworks/index.md   | 1 +
 docs/source/deployment/frameworks/modal.md   | 7 +++++++
 3 files changed, 9 insertions(+), 1 deletion(-)
 create mode 100644 docs/source/deployment/frameworks/modal.md

diff --git a/docs/source/deployment/frameworks/bentoml.md b/docs/source/deployment/frameworks/bentoml.md
index ea0b5d1d4c93b..2bf435bda8380 100644
--- a/docs/source/deployment/frameworks/bentoml.md
+++ b/docs/source/deployment/frameworks/bentoml.md
@@ -2,6 +2,6 @@
 
 # BentoML
 
-[BentoML](https://github.com/bentoml/BentoML) allows you to deploy a large language model (LLM) server with vLLM as the backend, which exposes OpenAI-compatible endpoints. You can serve the model locally or containerize it as an OCI-complicant image and deploy it on Kubernetes.
+[BentoML](https://github.com/bentoml/BentoML) allows you to deploy a large language model (LLM) server with vLLM as the backend, which exposes OpenAI-compatible endpoints. You can serve the model locally or containerize it as an OCI-compliant image and deploy it on Kubernetes.
 
 For details, see the tutorial [vLLM inference in the BentoML documentation](https://docs.bentoml.com/en/latest/use-cases/large-language-models/vllm.html).
diff --git a/docs/source/deployment/frameworks/index.md b/docs/source/deployment/frameworks/index.md
index 6a59131d36618..964782763f6b3 100644
--- a/docs/source/deployment/frameworks/index.md
+++ b/docs/source/deployment/frameworks/index.md
@@ -8,6 +8,7 @@ cerebrium
 dstack
 helm
 lws
+modal
 skypilot
 triton
 ```
diff --git a/docs/source/deployment/frameworks/modal.md b/docs/source/deployment/frameworks/modal.md
new file mode 100644
index 0000000000000..e7c42088e36a9
--- /dev/null
+++ b/docs/source/deployment/frameworks/modal.md
@@ -0,0 +1,7 @@
+(deployment-modal)=
+
+# Modal
+
+vLLM can be run on cloud GPUs with [Modal](https://modal.com), a serverless computing platform designed for fast auto-scaling.
+
+For details on how to deploy vLLM on Modal, see [this tutorial in the Modal documentation](https://modal.com/docs/examples/vllm_inference).

From c3cf54dda4df200bc8913ed69d210a7108dfa320 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Fri, 10 Jan 2025 11:10:12 +0800
Subject: [PATCH 062/115] [Doc][5/N] Move Community and API Reference to the
 bottom (#11896)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: Simon Mo <simon.mo@hey.com>
---
 README.md                                     |  2 +-
 .../source/design/automatic_prefix_caching.md |  2 +-
 docs/source/index.md                          | 62 ++++++++++++-------
 3 files changed, 40 insertions(+), 26 deletions(-)

diff --git a/README.md b/README.md
index 253a0bb913e37..67c557bfe13a9 100644
--- a/README.md
+++ b/README.md
@@ -41,7 +41,7 @@ vLLM is a fast and easy-to-use library for LLM inference and serving.
 vLLM is fast with:
 
 - State-of-the-art serving throughput
-- Efficient management of attention key and value memory with **PagedAttention**
+- Efficient management of attention key and value memory with [**PagedAttention**](https://blog.vllm.ai/2023/06/20/vllm.html)
 - Continuous batching of incoming requests
 - Fast model execution with CUDA/HIP graph
 - Quantizations: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), INT4, INT8, and FP8.
diff --git a/docs/source/design/automatic_prefix_caching.md b/docs/source/design/automatic_prefix_caching.md
index 4398536b2b4ad..6d3dd056e6a60 100644
--- a/docs/source/design/automatic_prefix_caching.md
+++ b/docs/source/design/automatic_prefix_caching.md
@@ -2,7 +2,7 @@
 
 # Automatic Prefix Caching
 
-The core idea of [PagedAttention](#design-paged-attention) is to partition the KV cache of each request into KV Blocks. Each block contains the attention keys and values for a fixed number of tokens. The PagedAttention algorithm allows these blocks to be stored in non-contiguous physical memory so that we can eliminate memory fragmentation by allocating the memory on demand.
+The core idea of [PagedAttention](https://blog.vllm.ai/2023/06/20/vllm.html) is to partition the KV cache of each request into KV Blocks. Each block contains the attention keys and values for a fixed number of tokens. The PagedAttention algorithm allows these blocks to be stored in non-contiguous physical memory so that we can eliminate memory fragmentation by allocating the memory on demand.
 
 To automatically cache the KV cache, we utilize the following key observation: Each KV block can be uniquely identified by the tokens within the block and the tokens in the prefix before the block.
 
diff --git a/docs/source/index.md b/docs/source/index.md
index 23e4304fe29d9..356fa4b7fd573 100644
--- a/docs/source/index.md
+++ b/docs/source/index.md
@@ -26,7 +26,7 @@ vLLM is a fast and easy-to-use library for LLM inference and serving.
 vLLM is fast with:
 
 - State-of-the-art serving throughput
-- Efficient management of attention key and value memory with **PagedAttention**
+- Efficient management of attention key and value memory with [**PagedAttention**](https://blog.vllm.ai/2023/06/20/vllm.html)
 - Continuous batching of incoming requests
 - Fast model execution with CUDA/HIP graph
 - Quantization: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), INT4, INT8, and FP8
@@ -54,6 +54,8 @@ For more information, check out the following:
 
 ## Documentation
 
+% How to start using vLLM?
+
 ```{toctree}
 :caption: Getting Started
 :maxdepth: 1
@@ -65,6 +67,8 @@ getting_started/troubleshooting
 getting_started/faq
 ```
 
+% What does vLLM support?
+
 ```{toctree}
 :caption: Models
 :maxdepth: 1
@@ -75,6 +79,8 @@ models/supported_models
 models/extensions/index
 ```
 
+% Additional capabilities
+
 ```{toctree}
 :caption: Features
 :maxdepth: 1
@@ -89,6 +95,8 @@ features/spec_decode
 features/compatibility_matrix
 ```
 
+% Details about running vLLM
+
 ```{toctree}
 :caption: Inference and Serving
 :maxdepth: 1
@@ -104,6 +112,8 @@ serving/usage_stats
 serving/integrations/index
 ```
 
+% Scaling up vLLM for production
+
 ```{toctree}
 :caption: Deployment
 :maxdepth: 1
@@ -115,6 +125,8 @@ deployment/frameworks/index
 deployment/integrations/index
 ```
 
+% Making the most out of vLLM
+
 ```{toctree}
 :caption: Performance
 :maxdepth: 1
@@ -123,28 +135,7 @@ performance/optimization
 performance/benchmarks
 ```
 
-% Community: User community resources
-
-```{toctree}
-:caption: Community
-:maxdepth: 1
-
-community/meetups
-community/sponsors
-```
-
-```{toctree}
-:caption: API Reference
-:maxdepth: 2
-
-api/offline_inference/index
-api/engine/index
-api/inference_params
-api/multimodal/index
-api/model/index
-```
-
-% Design Documents: Details about vLLM internals
+% Explanation of vLLM internals
 
 ```{toctree}
 :caption: Design Documents
@@ -159,7 +150,7 @@ design/automatic_prefix_caching
 design/multiprocessing
 ```
 
-% Developer Guide: How to contribute to the vLLM project
+% How to contribute to the vLLM project
 
 ```{toctree}
 :caption: Developer Guide
@@ -172,6 +163,29 @@ contributing/model/index
 contributing/vulnerability_management
 ```
 
+% Technical API specifications
+
+```{toctree}
+:caption: API Reference
+:maxdepth: 2
+
+api/offline_inference/index
+api/engine/index
+api/inference_params
+api/multimodal/index
+api/model/index
+```
+
+% Latest news and acknowledgements
+
+```{toctree}
+:caption: Community
+:maxdepth: 1
+
+community/meetups
+community/sponsors
+```
+
 # Indices and tables
 
 - {ref}`genindex`

From b844b99ad309b05f37b1acb5360c82be7b16281d Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Fri, 10 Jan 2025 11:24:00 +0800
Subject: [PATCH 063/115] [VLM] Enable tokenized inputs for merged multi-modal
 processor (#11900)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 tests/multimodal/test_processing.py      |  31 ++++--
 vllm/inputs/data.py                      |   4 +-
 vllm/inputs/preprocess.py                |   4 -
 vllm/model_executor/models/blip2.py      |  22 +++-
 vllm/model_executor/models/chameleon.py  |  32 +++++-
 vllm/model_executor/models/fuyu.py       |  24 +++--
 vllm/model_executor/models/interfaces.py |   8 +-
 vllm/model_executor/models/llava.py      |   8 +-
 vllm/model_executor/models/phi3v.py      |   4 +-
 vllm/model_executor/models/ultravox.py   |  18 ++--
 vllm/multimodal/processing.py            | 127 ++++++++++++++++-------
 vllm/multimodal/profiling.py             |   2 +-
 12 files changed, 207 insertions(+), 77 deletions(-)

diff --git a/tests/multimodal/test_processing.py b/tests/multimodal/test_processing.py
index d98bd9736b65f..d18909a4197b6 100644
--- a/tests/multimodal/test_processing.py
+++ b/tests/multimodal/test_processing.py
@@ -649,7 +649,7 @@ def test_limit_mm_per_prompt_apply(model_id, num_images, limit, is_valid):
         )
 
 
-def _test_processing_cache_correctness(
+def _test_processing_correctness(
     model_id: str,
     modalities: dict[str, bool],
     hit_rate: float,
@@ -691,6 +691,7 @@ def _test_processing_cache_correctness(
     baseline_processor = factories.build_processor(ctx, cache=None)
     cached_processor = factories.build_processor(ctx, cache=cache)
     dummy_inputs = baseline_processor.dummy_inputs
+    tokenizer = baseline_processor.info.get_tokenizer()
 
     rng = np.random.RandomState(0)
 
@@ -747,7 +748,25 @@ def _test_processing_cache_correctness(
         )
 
         assert baseline_result == cached_result, (
-            f"Failed ({batch_idx=}, {mm_data=})")
+            f"Failed ({batch_idx=}, {prompt=}, {mm_data=})")
+
+        baseline_tokenized_result = baseline_processor.apply(
+            tokenizer.encode(prompt),
+            mm_data=mm_data,
+            hf_processor_mm_kwargs={},
+        )
+
+        assert baseline_result == baseline_tokenized_result, (
+            f"Failed ({batch_idx=}, {prompt=}, {mm_data=})")
+
+        cached_tokenized_result = cached_processor.apply(
+            tokenizer.encode(prompt),
+            mm_data=mm_data,
+            hf_processor_mm_kwargs={},
+        )
+
+        assert cached_result == cached_tokenized_result, (
+            f"Failed ({batch_idx=}, {prompt=}, {mm_data=})")
 
 
 # yapf: disable
@@ -771,14 +790,14 @@ def _test_processing_cache_correctness(
 @pytest.mark.parametrize("num_batches", [32])
 @pytest.mark.parametrize("simplify_rate", [1.0])
 # yapf: enable
-def test_processing_cache_correctness(
+def test_processing_correctness(
     model_id: str,
     modalities: dict[str, bool],
     hit_rate: float,
     num_batches: int,
     simplify_rate: float,
 ):
-    _test_processing_cache_correctness(
+    _test_processing_correctness(
         model_id,
         modalities,
         hit_rate=hit_rate,
@@ -795,7 +814,7 @@ def test_processing_cache_correctness(
 @pytest.mark.parametrize("num_batches", [32])
 @pytest.mark.parametrize("simplify_rate", [1.0])
 # yapf: enable
-def test_processing_cache_correctness_phi3v(
+def test_processing_correctness_phi3v(
     model_id: str,
     modalities: dict[str, bool],
     hit_rate: float,
@@ -809,7 +828,7 @@ def test_processing_cache_correctness_phi3v(
 
     AutoImageProcessor.from_pretrained(model_id, trust_remote_code=True)
 
-    _test_processing_cache_correctness(
+    _test_processing_correctness(
         model_id,
         modalities,
         hit_rate=hit_rate,
diff --git a/vllm/inputs/data.py b/vllm/inputs/data.py
index cdaf6dd76eaa1..b8163a7acde1d 100644
--- a/vllm/inputs/data.py
+++ b/vllm/inputs/data.py
@@ -44,13 +44,13 @@ class TokensPrompt(TypedDict):
 
     multi_modal_data: NotRequired["MultiModalDataDict"]
     """
-    DEPRECATED: Optional multi-modal data to pass to the model,
+    Optional multi-modal data to pass to the model,
     if the model supports it.
     """
 
     mm_processor_kwargs: NotRequired[Dict[str, Any]]
     """
-    DEPRECATED: Optional multi-modal processor kwargs to be forwarded to the
+    Optional multi-modal processor kwargs to be forwarded to the
     multimodal input mapper & processor. Note that if multiple modalities
     have registered mappers etc for the model being considered, we attempt
     to pass the mm_processor_kwargs to each of them.
diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py
index a738ffe18e3ae..0890883cc984f 100644
--- a/vllm/inputs/preprocess.py
+++ b/vllm/inputs/preprocess.py
@@ -279,10 +279,6 @@ async def _process_multimodal_async(
 
         mm_processor = self.mm_registry.create_processor(
             self.model_config, tokenizer)
-        if isinstance(prompt, list):
-            logger.warning("Passing `multi_modal_data` in TokensPrompt is"
-                           "deprecated and will be removed in a future update")
-            prompt = tokenizer.decode(prompt)
         if mm_processor_kwargs is None:
             mm_processor_kwargs = {}
 
diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py
index 7dfc0b687c6e3..917b88e802071 100644
--- a/vllm/model_executor/models/blip2.py
+++ b/vllm/model_executor/models/blip2.py
@@ -441,6 +441,24 @@ def get_dummy_processor_inputs(
 
 class Blip2MultiModalProcessor(BaseMultiModalProcessor[Blip2ProcessingInfo]):
 
+    def _call_hf_processor(
+        self,
+        prompt: str,
+        mm_data: Mapping[str, object],
+        mm_kwargs: Mapping[str, object],
+    ) -> BatchFeature:
+        if not mm_data:
+            # HF processor always adds placeholders even when there's no image
+            tokenizer = self.info.get_tokenizer()
+            prompt_ids = tokenizer.encode(prompt)
+            return BatchFeature(dict(input_ids=[prompt_ids]), tensor_type="pt")
+
+        return super()._call_hf_processor(
+            prompt=prompt,
+            mm_data=mm_data,
+            mm_kwargs=mm_kwargs,
+        )
+
     def _get_mm_fields_config(
         self,
         hf_inputs: BatchFeature,
@@ -469,11 +487,11 @@ def _get_prompt_replacements(
 
     def apply(
         self,
-        prompt_text: str,
+        prompt: Union[str, list[int]],
         mm_data: MultiModalDataDict,
         hf_processor_mm_kwargs: Mapping[str, object],
     ) -> MultiModalInputsV2:
-        result = super().apply(prompt_text, mm_data, hf_processor_mm_kwargs)
+        result = super().apply(prompt, mm_data, hf_processor_mm_kwargs)
 
         # Only <image> tokens should be considered as placeholders,
         # so we ignore the trailing bos_token
diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py
index 452fe727875fe..a6634204699c9 100644
--- a/vllm/model_executor/models/chameleon.py
+++ b/vllm/model_executor/models/chameleon.py
@@ -99,6 +99,34 @@ def get_dummy_processor_inputs(
 class ChameleonMultiModalProcessor(
         BaseMultiModalProcessor[ChameleonProcessingInfo]):
 
+    def _call_hf_processor(
+        self,
+        prompt: str,
+        mm_data: Mapping[str, object],
+        mm_kwargs: Mapping[str, object],
+    ) -> BatchFeature:
+        if not mm_data:
+            prompt_ids = self.info.get_tokenizer().encode(prompt)
+            prompt_ids = self._apply_hf_processor_tokens_only(prompt_ids)
+            return BatchFeature(dict(input_ids=[prompt_ids]), tensor_type="pt")
+
+        return super()._call_hf_processor(
+            prompt=prompt,
+            mm_data=mm_data,
+            mm_kwargs=mm_kwargs,
+        )
+
+    def _apply_hf_processor_tokens_only(
+        self,
+        prompt_tokens: list[int],
+    ) -> list[int]:
+        # HF processor adds sep token for chat mode
+        tokenizer = self.info.get_tokenizer()
+        sep_token_id: int = \
+            tokenizer.vocab[tokenizer.sep_token]  # type: ignore
+
+        return prompt_tokens + [sep_token_id]
+
     def _get_mm_fields_config(
         self,
         hf_inputs: BatchFeature,
@@ -128,11 +156,11 @@ def _get_prompt_replacements(
 
     def apply(
         self,
-        prompt_text: str,
+        prompt: Union[str, list[int]],
         mm_data: MultiModalDataDict,
         hf_processor_mm_kwargs: Mapping[str, object],
     ) -> MultiModalInputsV2:
-        result = super().apply(prompt_text, mm_data, hf_processor_mm_kwargs)
+        result = super().apply(prompt, mm_data, hf_processor_mm_kwargs)
 
         # Only <image> tokens should be considered as placeholders,
         # so we ignore the image_start_token and image_end_token
diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py
index 59af5f0b3ae98..63e7147f84e03 100644
--- a/vllm/model_executor/models/fuyu.py
+++ b/vllm/model_executor/models/fuyu.py
@@ -16,7 +16,7 @@
 """ PyTorch Fuyu model."""
 import math
 from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple,
-                    TypedDict)
+                    TypedDict, Union)
 
 import torch
 import torch.nn as nn
@@ -149,14 +149,10 @@ def _call_hf_processor(
         mm_data: Mapping[str, object],
         mm_kwargs: Mapping[str, object],
     ) -> BatchFeature:
-
         if not mm_data:
             # Avoid warning from HF logger for text-only input
-            # Input_ids format: bos_token_id + prompt_token_ids + boa_token_id
-            # Tokenizer won't add boa_token_id by default, we add it manually.
-            tokenizer = self.info.get_tokenizer()
-            boa_token_id: int = tokenizer.vocab["<0x04>"]  # type: ignore
-            prompt_ids = tokenizer.encode(prompt) + [boa_token_id]
+            prompt_ids = self.info.get_tokenizer().encode(prompt)
+            prompt_ids = self._apply_hf_processor_tokens_only(prompt_ids)
             return BatchFeature(dict(input_ids=[prompt_ids]), tensor_type="pt")
 
         processed_outputs = super()._call_hf_processor(
@@ -181,6 +177,16 @@ def _call_hf_processor(
 
         return processed_outputs
 
+    def _apply_hf_processor_tokens_only(
+        self,
+        prompt_tokens: list[int],
+    ) -> list[int]:
+        # HF processor adds boa_token_id
+        tokenizer = self.info.get_tokenizer()
+        boa_token_id: int = tokenizer.vocab["<0x04>"]  # type: ignore
+
+        return prompt_tokens + [boa_token_id]
+
     def _get_mm_fields_config(
         self,
         hf_inputs: BatchFeature,
@@ -223,11 +229,11 @@ def get_replacement_fuyu(item_idx: int):
 
     def apply(
         self,
-        prompt_text: str,
+        prompt: Union[str, list[int]],
         mm_data: MultiModalDataDict,
         hf_processor_mm_kwargs: Mapping[str, object],
     ) -> MultiModalInputsV2:
-        result = super().apply(prompt_text, mm_data, hf_processor_mm_kwargs)
+        result = super().apply(prompt, mm_data, hf_processor_mm_kwargs)
 
         # Only |SPEAKER| (image) tokens should be considered as placeholders,
         # so we ignore the trailing bos_token_id
diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py
index b51cba86ec1a4..c5fd0d9332379 100644
--- a/vllm/model_executor/models/interfaces.py
+++ b/vllm/model_executor/models/interfaces.py
@@ -39,13 +39,13 @@ def get_multimodal_embeddings(self, **kwargs) -> Optional[T]:
 
         The output embeddings must be one of the following formats:
     
-        - A list or tuple of 2D tensors, where each tensor corresponds to 
-            each input multimodal data item (e.g, image).
+        - A list or tuple of 2D tensors, where each tensor corresponds to
+          each input multimodal data item (e.g., image).
         - A single 3D tensor, with the batch dimension grouping the 2D tensors.
 
         Note:
-            The returned multimodal embeddings must be in the same order as 
-            the appearances of their corresponding multimodal data item in the 
+            The returned multimodal embeddings must be in the same order as
+            the appearances of their corresponding multimodal data item in the
             input prompt.
         """
         ...
diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py
index 8d94acf3b21d5..bb3db60c7d8ed 100644
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -724,7 +724,7 @@ class MantisMultiModalProcessor(LlavaMultiModalProcessor):
 
     def apply(
         self,
-        prompt_text: str,
+        prompt: Union[str, list[int]],
         mm_data: MultiModalDataDict,
         hf_processor_mm_kwargs: Mapping[str, object],
     ) -> MultiModalInputsV2:
@@ -737,7 +737,7 @@ def apply(
             image_height=-1,
         )
 
-        result = super().apply(prompt_text, mm_data, hf_processor_mm_kwargs)
+        result = super().apply(prompt, mm_data, hf_processor_mm_kwargs)
 
         mm_items = self._to_mm_items(mm_data)
         mm_item_counts = mm_items.get_all_counts()
@@ -760,7 +760,7 @@ def get_replacement_mantis(item_idx: int):
             )
         ])
 
-        prompt_ids, prompt_text, _ = self._apply_prompt_replacements(
+        prompt_ids, prompt, _ = self._apply_prompt_replacements(
             result["prompt_token_ids"],
             mantis_mm_repls,
             mm_item_counts,
@@ -788,7 +788,7 @@ def get_replacement_mantis(item_idx: int):
 
         return MultiModalInputsV2(
             type="multimodal",
-            prompt=prompt_text,
+            prompt=prompt,
             prompt_token_ids=prompt_ids,
             mm_kwargs=mm_kwargs,
             mm_placeholders=mm_placeholder_ranges,
diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index a1b1af35604db..7a230e5beb367 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -481,11 +481,11 @@ def _apply_prompt_replacements(
 
     def apply(
         self,
-        prompt_text: str,
+        prompt: Union[str, list[int]],
         mm_data: MultiModalDataDict,
         hf_processor_mm_kwargs: Mapping[str, object],
     ) -> MultiModalInputsV2:
-        result = super().apply(prompt_text, mm_data, hf_processor_mm_kwargs)
+        result = super().apply(prompt, mm_data, hf_processor_mm_kwargs)
 
         # Only <|image|> tokens should be considered as placeholders,
         # so we ignore the trailing bos_token_id
diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py
index fada22d685dd6..3edfb5107683a 100644
--- a/vllm/model_executor/models/ultravox.py
+++ b/vllm/model_executor/models/ultravox.py
@@ -138,12 +138,8 @@ def _call_hf_processor(
     ) -> BatchFeature:
         # Text-only input not supported in composite processor
         if not mm_data:
-            tokenizer = self.info.get_tokenizer()
-
-            prompt_ids = tokenizer.encode(
-                prompt,
-                add_special_tokens=False,  # type: ignore
-            )
+            prompt_ids = self.info.get_tokenizer().encode(prompt)
+            prompt_ids = self._apply_hf_processor_tokens_only(prompt_ids)
             return BatchFeature(dict(input_ids=[prompt_ids]), tensor_type="pt")
 
         mm_data = dict(mm_data)
@@ -188,6 +184,16 @@ def _call_hf_processor(
         )
         return BatchFeature(combined_outputs)
 
+    def _apply_hf_processor_tokens_only(
+        self,
+        prompt_tokens: list[int],
+    ) -> list[int]:
+        # HF processor omits bos_token_id by setting add_special_tokens=False
+        tokenizer = self.info.get_tokenizer()
+        assert prompt_tokens[0] == tokenizer.bos_token_id
+
+        return prompt_tokens[1:]
+
     def _get_mm_fields_config(
         self,
         hf_inputs: BatchFeature,
diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py
index 07d883d5d7295..8b47dfb07387f 100644
--- a/vllm/multimodal/processing.py
+++ b/vllm/multimodal/processing.py
@@ -725,15 +725,15 @@ def _call_hf_processor(
             mm_kwargs,
         )
 
-    def _apply_hf_processor(
+    def _apply_hf_processor_text_mm(
         self,
         prompt_text: str,
         mm_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, object],
     ) -> tuple[list[int], MultiModalKwargs]:
         """
-        Wrapper of :meth:`_call_hf_processor` that applies
-        additional pre-processing and post-processing.
+        Apply the HF processor on the prompt text and multi-modal data
+        together.
         """
         processor_data, passthrough_data = self._get_hf_mm_data(mm_items)
 
@@ -753,40 +753,93 @@ def _apply_hf_processor(
 
         return prompt_ids, mm_kwargs
 
-    def _apply_hf_processor_missing(
-        self,
-        prompt_text: str,
-        mm_missing_data_items: MultiModalDataItems,
-        hf_processor_mm_kwargs: Mapping[str, object],
-    ):
+    def _apply_hf_processor_text_only(self, prompt_text: str) -> list[int]:
         """
-        Apply the HF processor on the full prompt text, but only on the
-        multi-modal data that are missing from the cache.
+        Apply the HF processor on the prompt text only.
 
-        Note:
-            We pass prompt text and multi-modal data into the HF processor
-            in separate calls to avoid HF prompt replacement being done for
-            cached items; instead, we rely on our own prompt replacement logic
-            (:meth:`_get_prompt_replacements`) for the full text.
+        The HF processor is called on text and multi-modal items together,
+        so we pass an empty collection of multi-modal items
+        to go along with the text.
         """
-        mm_missing_counts = mm_missing_data_items.get_all_counts()
-
-        prompt_ids, _ = self._apply_hf_processor(
+        prompt_ids, _ = self._apply_hf_processor_text_mm(
             prompt_text=prompt_text,
             mm_items=MultiModalDataItems({}),
             hf_processor_mm_kwargs={},
         )
 
-        # Some HF processors (e.g. Qwen2-VL) expect corresponding
-        # multi-modal tokens to be in the prompt text
+        return prompt_ids
+
+    def _apply_hf_processor_tokens_only(
+        self,
+        prompt_tokens: list[int],
+    ) -> list[int]:
+        """
+        Apply the HF processor on the prompt tokens only.
+
+        Most HF processors accept prompt text but not prompt tokens.
+        If the HF processor adds or removes tokens that are not related to
+        multi-modal data, you should override this method so it is consistent
+        with the output of :meth:`_apply_hf_processor_text_only` on the
+        corresponding text.
+        """
+        return prompt_tokens
+
+    def _apply_hf_processor_mm_only(
+        self,
+        mm_items: MultiModalDataItems,
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> MultiModalKwargs:
+        """
+        Apply the HF processor on the multi-modal data only.
+
+        Since HF processor requires that text and multi-modal items
+        correspond to each other, we generate dummy text using
+        :class:`DummyInputsBuilder` to go along with the multi-modal data.
+        """
+        mm_counts = mm_items.get_all_counts()
+
         dummy_inputs = self.dummy_inputs.get_dummy_processor_inputs(
             self.info.ctx.model_config.max_model_len,
-            mm_missing_counts,
+            mm_counts,
         )
 
-        _, mm_missing_kwargs = self._apply_hf_processor(
+        _, mm_kwargs = self._apply_hf_processor_text_mm(
             prompt_text=dummy_inputs.prompt_text,
-            mm_items=mm_missing_data_items,
+            mm_items=mm_items,
+            hf_processor_mm_kwargs=hf_processor_mm_kwargs,
+        )
+
+        return mm_kwargs
+
+    def _apply_hf_processor_main(
+        self,
+        prompt: Union[str, list[int]],
+        mm_items: MultiModalDataItems,
+        hf_processor_mm_kwargs: Mapping[str, object],
+        *,
+        enable_hf_prompt_replacement: bool,
+    ) -> tuple[list[int], MultiModalKwargs]:
+        """
+        Apply the HF processor on the prompt text and multi-modal data.
+
+        Note:
+            If :code:`enable_hf_prompt_replacement=True`, the prompt should
+            correspond to the multi-modal items.
+        """
+        if isinstance(prompt, str):
+            if enable_hf_prompt_replacement:
+                return self._apply_hf_processor_text_mm(
+                    prompt_text=prompt,
+                    mm_items=mm_items,
+                    hf_processor_mm_kwargs=hf_processor_mm_kwargs,
+                )
+
+            prompt_ids = self._apply_hf_processor_text_only(prompt)
+        else:
+            prompt_ids = self._apply_hf_processor_tokens_only(prompt)
+
+        mm_missing_kwargs = self._apply_hf_processor_mm_only(
+            mm_items=mm_items,
             hf_processor_mm_kwargs=hf_processor_mm_kwargs,
         )
 
@@ -794,7 +847,7 @@ def _apply_hf_processor_missing(
 
     def _cached_apply_hf_processor(
         self,
-        prompt_text: str,
+        prompt: Union[str, list[int]],
         mm_data_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, object],
     ) -> tuple[list[int], MultiModalKwargs]:
@@ -807,10 +860,11 @@ def _cached_apply_hf_processor(
 
         _, passthrough_data = self._get_hf_mm_data(mm_data_items)
         if cache is None or passthrough_data:
-            return self._apply_hf_processor(
-                prompt_text=prompt_text,
+            return self._apply_hf_processor_main(
+                prompt=prompt,
                 mm_items=mm_data_items,
                 hf_processor_mm_kwargs=hf_processor_mm_kwargs,
+                enable_hf_prompt_replacement=True,
             )
 
         mm_maybe_cached_kw_items = {
@@ -832,10 +886,13 @@ def _cached_apply_hf_processor(
         }
         mm_missing_data_items = self._to_mm_items(mm_missing_data)
 
-        prompt_ids, mm_missing_kwargs = self._apply_hf_processor_missing(
-            prompt_text=prompt_text,
-            mm_missing_data_items=mm_missing_data_items,
+        # NOTE: `prompt` does not correspond to `mm_missing_data_items`,
+        # so we need to pass `enable_hf_prompt_replacement=False`
+        prompt_ids, mm_missing_kwargs = self._apply_hf_processor_main(
+            prompt=prompt,
+            mm_items=mm_missing_data_items,
             hf_processor_mm_kwargs=hf_processor_mm_kwargs,
+            enable_hf_prompt_replacement=False,
         )
 
         mm_missing_next_idx = {
@@ -1018,7 +1075,7 @@ def _validate_mm_placeholders(
 
     def apply(
         self,
-        prompt_text: str,
+        prompt: Union[str, list[int]],
         mm_data: MultiModalDataDict,
         hf_processor_mm_kwargs: Mapping[str, object],
     ) -> MultiModalInputsV2:
@@ -1056,7 +1113,7 @@ def apply(
             mm_hashes = None
 
         prompt_ids, mm_kwargs = self._cached_apply_hf_processor(
-            prompt_text,
+            prompt,
             mm_items,
             hf_processor_mm_kwargs,
         )
@@ -1101,12 +1158,12 @@ def apply(
         # there is no need for us to insert them
         if all(len(repls) == 0 for repls in mm_missing_repls.items()):
             tokenizer = self.info.get_tokenizer()
-            prompt_text = decode_tokens(tokenizer, prompt_ids)
+            prompt = decode_tokens(tokenizer, prompt_ids)
             mm_placeholders = hf_mm_placeholders
         else:
             (
                 prompt_ids,
-                prompt_text,
+                prompt,
                 missing_mm_placeholders,
             ) = self._apply_prompt_replacements(
                 prompt_ids,
@@ -1125,7 +1182,7 @@ def apply(
 
         return MultiModalInputsV2(
             type="multimodal",
-            prompt=prompt_text,
+            prompt=prompt,
             prompt_token_ids=prompt_ids,
             mm_kwargs=mm_kwargs,
             mm_hashes=mm_hashes,
diff --git a/vllm/multimodal/profiling.py b/vllm/multimodal/profiling.py
index 6f7da1509990f..ec580cd6ecddd 100644
--- a/vllm/multimodal/profiling.py
+++ b/vllm/multimodal/profiling.py
@@ -137,7 +137,7 @@ def _get_dummy_mm_inputs(
             seq_len, mm_counts)
 
         return self.processor.apply(
-            prompt_text=processor_inputs.prompt_text,
+            prompt=processor_inputs.prompt_text,
             mm_data=processor_inputs.mm_data,
             hf_processor_mm_kwargs=processor_inputs.hf_processor_mm_kwargs,
         )

From 3de2b1eafb12e420c563cb7153d4d2f0e8451ca9 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Fri, 10 Jan 2025 11:25:20 +0800
Subject: [PATCH 064/115] [Doc] Show default pooling method in a table (#11904)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 docs/source/models/generative_models.md |  8 ++--
 docs/source/models/pooling_models.md    | 59 +++++++++++++++++--------
 2 files changed, 45 insertions(+), 22 deletions(-)

diff --git a/docs/source/models/generative_models.md b/docs/source/models/generative_models.md
index 6228c7c2ac957..a9f74c4d3fbb8 100644
--- a/docs/source/models/generative_models.md
+++ b/docs/source/models/generative_models.md
@@ -8,14 +8,14 @@ In vLLM, generative models implement the {class}`~vllm.model_executor.models.Vll
 Based on the final hidden states of the input, these models output log probabilities of the tokens to generate,
 which are then passed through {class}`~vllm.model_executor.layers.Sampler` to obtain the final text.
 
+For generative models, the only supported `--task` option is `"generate"`.
+Usually, this is automatically inferred so you don't have to specify it.
+
 ## Offline Inference
 
 The {class}`~vllm.LLM` class provides various methods for offline inference.
 See [Engine Arguments](#engine-args) for a list of options when initializing the model.
 
-For generative models, the only supported {code}`task` option is {code}`"generate"`.
-Usually, this is automatically inferred so you don't have to specify it.
-
 ### `LLM.generate`
 
 The {class}`~vllm.LLM.generate` method is available to all generative models in vLLM.
@@ -33,7 +33,7 @@ for output in outputs:
 ```
 
 You can optionally control the language generation by passing {class}`~vllm.SamplingParams`.
-For example, you can use greedy sampling by setting {code}`temperature=0`:
+For example, you can use greedy sampling by setting `temperature=0`:
 
 ```python
 llm = LLM(model="facebook/opt-125m")
diff --git a/docs/source/models/pooling_models.md b/docs/source/models/pooling_models.md
index 3e4407cfdc233..745f3fd81980d 100644
--- a/docs/source/models/pooling_models.md
+++ b/docs/source/models/pooling_models.md
@@ -14,30 +14,53 @@ As shown in the [Compatibility Matrix](#compatibility-matrix), most vLLM feature
 pooling models as they only work on the generation or decode stage, so performance may not improve as much.
 ```
 
-## Offline Inference
-
-The {class}`~vllm.LLM` class provides various methods for offline inference.
-See [Engine Arguments](#engine-args) for a list of options when initializing the model.
-
-For pooling models, we support the following {code}`task` options:
-
-- Embedding ({code}`"embed"` / {code}`"embedding"`)
-- Classification ({code}`"classify"`)
-- Sentence Pair Scoring ({code}`"score"`)
-- Reward Modeling ({code}`"reward"`)
+For pooling models, we support the following `--task` options.
+The selected option sets the default pooler used to extract the final hidden states:
+
+```{list-table}
+:widths: 50 25 25 25
+:header-rows: 1
+
+* - Task
+  - Pooling Type
+  - Normalization
+  - Softmax
+* - Embedding (`embed`)
+  - `LAST`
+  - ✅︎
+  - ✗
+* - Classification (`classify`)
+  - `LAST`
+  - ✗
+  - ✅︎
+* - Sentence Pair Scoring (`score`)
+  - \*
+  - \*
+  - \*
+* - Reward Modeling (`reward`)
+  - `ALL`
+  - ✗
+  - ✗
+```
 
-The selected task determines the default {class}`~vllm.model_executor.layers.Pooler` that is used:
+\*The default pooler is always defined by the model.
 
-- Embedding: Extract only the hidden states corresponding to the last token, and apply normalization.
-- Classification: Extract only the hidden states corresponding to the last token, and apply softmax.
-- Sentence Pair Scoring: Extract only the hidden states corresponding to the last token, and apply softmax.
-- Reward Modeling: Extract all of the hidden states and return them directly.
+```{note}
+If the model's implementation in vLLM defines its own pooler, the default pooler is set to that instead of the one specified in this table.
+```
 
 When loading [Sentence Transformers](https://huggingface.co/sentence-transformers) models,
-we attempt to override the default pooler based on its Sentence Transformers configuration file ({code}`modules.json`).
+we attempt to override the default pooler based on its Sentence Transformers configuration file (`modules.json`).
 
-You can customize the model's pooling method via the {code}`override_pooler_config` option,
+```{tip}
+You can customize the model's pooling method via the `--override-pooler-config` option,
 which takes priority over both the model's and Sentence Transformers's defaults.
+```
+
+## Offline Inference
+
+The {class}`~vllm.LLM` class provides various methods for offline inference.
+See [Engine Arguments](#engine-args) for a list of options when initializing the model.
 
 ### `LLM.encode`
 

From cf5f000d218fbcbc4bf404de8ed9d9607a128c3b Mon Sep 17 00:00:00 2001
From: Chen Zhang <zhangch99@outlook.com>
Date: Fri, 10 Jan 2025 13:14:42 +0800
Subject: [PATCH 065/115] [torch.compile] Hide KV cache behind torch.compile
 boundary (#11677)

Signed-off-by: Chen Zhang <zhangch99@outlook.com>
---
 tests/kernels/test_encoder_decoder_attn.py | 18 +++--
 tests/test_utils.py                        | 85 +++++++++++++++++++++-
 tests/v1/engine/test_engine_core.py        |  3 +
 tests/v1/engine/test_engine_core_client.py |  3 +
 vllm/attention/layer.py                    | 29 +++++---
 vllm/config.py                             |  1 -
 vllm/forward_context.py                    | 33 +++++----
 vllm/utils.py                              | 35 +++++++++
 vllm/v1/worker/gpu_model_runner.py         |  6 +-
 vllm/worker/cpu_enc_dec_model_runner.py    |  3 +-
 vllm/worker/cpu_model_runner.py            |  3 +-
 vllm/worker/cpu_pooling_model_runner.py    |  3 +-
 vllm/worker/cpu_worker.py                  |  4 +-
 vllm/worker/enc_dec_model_runner.py        |  3 +-
 vllm/worker/model_runner.py                |  5 +-
 vllm/worker/pooling_model_runner.py        |  3 +-
 vllm/worker/worker.py                      |  4 +-
 vllm/worker/worker_base.py                 |  1 +
 18 files changed, 198 insertions(+), 44 deletions(-)

diff --git a/tests/kernels/test_encoder_decoder_attn.py b/tests/kernels/test_encoder_decoder_attn.py
index 614674375786e..e008a56de6208 100644
--- a/tests/kernels/test_encoder_decoder_attn.py
+++ b/tests/kernels/test_encoder_decoder_attn.py
@@ -142,12 +142,18 @@ class that Attention will automatically select when it is constructed.
             torch.tensor([], dtype=torch.float32, device=CUDA_DEVICE))
 
     # Construct KV cache
-    kv_cache = make_kv_cache(test_pt.num_blocks,
-                             test_pt.num_heads,
-                             test_pt.head_size,
-                             test_pt.block_size,
-                             device=CUDA_DEVICE,
-                             backend=test_pt.backend_name)
+    if test_pt.attn_type in (AttentionType.DECODER,
+                             AttentionType.ENCODER_DECODER):
+        kv_cache = make_kv_cache(test_pt.num_blocks,
+                                 test_pt.num_heads,
+                                 test_pt.head_size,
+                                 test_pt.block_size,
+                                 device=CUDA_DEVICE,
+                                 backend=test_pt.backend_name)
+    else:
+        kv_cache = torch.tensor([])
+
+    attn.kv_cache = [kv_cache]
     return TestResources(scale, attn, kv_cache)
 
 
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 14d2fbd63b90d..6810e0302f897 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -7,9 +7,11 @@
 import torch
 from vllm_test_utils import monitor
 
+from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
 from vllm.utils import (FlexibleArgumentParser, PlaceholderModule,
-                        StoreBoolean, deprecate_kwargs, get_open_port,
-                        memory_profiling, merge_async_iterators, supports_kw)
+                        StoreBoolean, bind_kv_cache, deprecate_kwargs,
+                        get_open_port, memory_profiling, merge_async_iterators,
+                        supports_kw)
 
 from .utils import error_on_warning, fork_new_process_for_each_test
 
@@ -325,6 +327,85 @@ def measure_current_non_torch():
     lib.cudaFree(handle2)
 
 
+def test_bind_kv_cache():
+    from vllm.attention import Attention
+
+    ctx = {
+        'layers.0.self_attn': Attention(32, 128, 0.1),
+        'layers.1.self_attn': Attention(32, 128, 0.1),
+        'layers.2.self_attn': Attention(32, 128, 0.1),
+        'layers.3.self_attn': Attention(32, 128, 0.1),
+    }
+    kv_cache = [
+        torch.zeros((1, )),
+        torch.zeros((1, )),
+        torch.zeros((1, )),
+        torch.zeros((1, )),
+    ]
+    bind_kv_cache(ctx, [kv_cache])
+    assert ctx['layers.0.self_attn'].kv_cache[0] is kv_cache[0]
+    assert ctx['layers.1.self_attn'].kv_cache[0] is kv_cache[1]
+    assert ctx['layers.2.self_attn'].kv_cache[0] is kv_cache[2]
+    assert ctx['layers.3.self_attn'].kv_cache[0] is kv_cache[3]
+
+def test_bind_kv_cache_non_attention():
+    from vllm.attention import Attention
+
+    # example from Jamba PP=2
+    ctx = {
+        'model.layers.20.attn': Attention(32, 128, 0.1),
+        'model.layers.28.attn': Attention(32, 128, 0.1),
+    }
+    kv_cache = [
+        torch.zeros((1, )),
+        torch.zeros((1, )),
+    ]
+    bind_kv_cache(ctx, [kv_cache])
+    assert ctx['model.layers.20.attn'].kv_cache[0] is kv_cache[0]
+    assert ctx['model.layers.28.attn'].kv_cache[0] is kv_cache[1]
+
+
+def test_bind_kv_cache_encoder_decoder():
+    from vllm.attention import Attention, AttentionType
+
+    # example from bart
+    ctx = {
+        'encoder.layers.0.self_attn.attn':
+            Attention(32, 128, 0.1, attn_type=AttentionType.ENCODER),
+        'decoder.layers.0.encoder_attn.attn':
+            Attention(32, 128, 0.1, attn_type=AttentionType.ENCODER_DECODER),
+        'decoder.layers.0.self_attn.attn':
+            Attention(32, 128, 0.1, attn_type=AttentionType.DECODER),
+    }
+
+    kv_cache = [
+        torch.zeros((1, )),
+    ]
+    encoder_kv_cache = ctx['encoder.layers.0.self_attn.attn'].kv_cache
+
+    bind_kv_cache(ctx, [kv_cache])
+    assert ctx['encoder.layers.0.self_attn.attn'].kv_cache is encoder_kv_cache
+    assert ctx['decoder.layers.0.encoder_attn.attn'].kv_cache[0] is kv_cache[0]
+    assert ctx['decoder.layers.0.self_attn.attn'].kv_cache[0] is kv_cache[0]
+
+
+def test_bind_kv_cache_pp():
+    cfg = VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=2))
+    with set_current_vllm_config(cfg):
+        from vllm.attention import Attention
+
+        ctx = {
+            'layers.0.self_attn': Attention(32, 128, 0.1),
+        }
+        kv_cache = [
+            [torch.zeros((1, ))],
+            [torch.zeros((1, ))]
+        ]
+        bind_kv_cache(ctx, kv_cache)
+        assert ctx['layers.0.self_attn'].kv_cache[0] is kv_cache[0][0]
+        assert ctx['layers.0.self_attn'].kv_cache[1] is kv_cache[1][0]
+
+
 def test_placeholder_module_error_handling():
     placeholder = PlaceholderModule("placeholder_1234")
 
diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py
index 8dd9b23fbdd5f..5b1732036e807 100644
--- a/tests/v1/engine/test_engine_core.py
+++ b/tests/v1/engine/test_engine_core.py
@@ -4,6 +4,7 @@
 import pytest
 from transformers import AutoTokenizer
 
+from tests.utils import fork_new_process_for_each_test
 from vllm import SamplingParams
 from vllm.engine.arg_utils import EngineArgs
 from vllm.platforms import current_platform
@@ -36,6 +37,7 @@ def make_request() -> EngineCoreRequest:
     )
 
 
+@fork_new_process_for_each_test
 def test_engine_core(monkeypatch):
 
     with monkeypatch.context() as m:
@@ -138,6 +140,7 @@ def test_engine_core(monkeypatch):
         assert len(engine_core.scheduler.running) == 0
 
 
+@fork_new_process_for_each_test
 def test_engine_core_advanced_sampling(monkeypatch):
     """
     A basic end-to-end test to verify that the engine functions correctly 
diff --git a/tests/v1/engine/test_engine_core_client.py b/tests/v1/engine/test_engine_core_client.py
index 5a21806e57a11..7eac16f2cf542 100644
--- a/tests/v1/engine/test_engine_core_client.py
+++ b/tests/v1/engine/test_engine_core_client.py
@@ -6,6 +6,7 @@
 import pytest
 from transformers import AutoTokenizer
 
+from tests.utils import fork_new_process_for_each_test
 from vllm import SamplingParams
 from vllm.engine.arg_utils import EngineArgs
 from vllm.platforms import current_platform
@@ -75,6 +76,7 @@ async def loop_until_done_async(client: EngineCoreClient, outputs: Dict):
             break
 
 
+@fork_new_process_for_each_test
 @pytest.mark.parametrize("multiprocessing_mode", [True, False])
 def test_engine_core_client(monkeypatch, multiprocessing_mode: bool):
 
@@ -143,6 +145,7 @@ def test_engine_core_client(monkeypatch, multiprocessing_mode: bool):
         client.abort_requests([request.request_id])
 
 
+@fork_new_process_for_each_test
 @pytest.mark.asyncio
 async def test_engine_core_client_asyncio(monkeypatch):
 
diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py
index f1b3598e60b54..55e4e14027f79 100644
--- a/vllm/attention/layer.py
+++ b/vllm/attention/layer.py
@@ -121,6 +121,13 @@ def __init__(
         compilation_config.static_forward_context[prefix] = self
         self.layer_name = prefix
         self.attn_type = attn_type
+        # Use one placeholder kv cache tensor per virtual engine during init;
+        # they will be replaced by bind_kv_cache.
+        # This variable will not be accessed if use_direct_call is True.
+        self.kv_cache = [
+            torch.tensor([]) for _ in range(get_current_vllm_config(
+            ).parallel_config.pipeline_parallel_size)
+        ]
 
     def forward(
         self,
@@ -148,11 +155,11 @@ def forward(
             if value is not None:
                 value = value.view(-1, self.num_kv_heads, self.head_size)
             torch.ops.vllm.unified_attention_with_output(
-                query, key, value, output, kv_cache, self.layer_name)
+                query, key, value, output, self.layer_name)
             return output.view(-1, hidden_size)
         else:
             return torch.ops.vllm.unified_attention(query, key, value,
-                                                    kv_cache, self.layer_name)
+                                                    self.layer_name)
 
     def extra_repr(self) -> str:
         s = f"head_size={self.impl.head_size}"  # type: ignore
@@ -230,12 +237,12 @@ def unified_attention(
     query: torch.Tensor,
     key: torch.Tensor,
     value: torch.Tensor,
-    kv_cache: torch.Tensor,
     layer_name: str,
 ) -> torch.Tensor:
     forward_context: ForwardContext = get_forward_context()
-    attn_metadata = forward_context.dynamic_forward_context
-    self = forward_context.static_forward_context[layer_name]
+    attn_metadata = forward_context.attn_metadata
+    self = forward_context.attn_layers[layer_name]
+    kv_cache = self.kv_cache[forward_context.virtual_engine]
     return self.impl.forward(query, key, value, kv_cache, attn_metadata,
                              self._k_scale, self._v_scale)
 
@@ -244,7 +251,6 @@ def unified_attention_fake(
     query: torch.Tensor,
     key: torch.Tensor,
     value: torch.Tensor,
-    kv_cache: torch.Tensor,
     layer_name: str,
 ) -> torch.Tensor:
     return torch.empty_like(query).contiguous()
@@ -253,7 +259,7 @@ def unified_attention_fake(
 direct_register_custom_op(
     op_name="unified_attention",
     op_func=unified_attention,
-    mutates_args=["kv_cache"],
+    mutates_args=[],
     fake_impl=unified_attention_fake,
     dispatch_key=current_platform.dispatch_key,
 )
@@ -264,12 +270,12 @@ def unified_attention_with_output(
     key: torch.Tensor,
     value: torch.Tensor,
     output: torch.Tensor,
-    kv_cache: torch.Tensor,
     layer_name: str,
 ) -> None:
     forward_context: ForwardContext = get_forward_context()
-    attn_metadata = forward_context.dynamic_forward_context
-    self = forward_context.static_forward_context[layer_name]
+    attn_metadata = forward_context.attn_metadata
+    self = forward_context.attn_layers[layer_name]
+    kv_cache = self.kv_cache[forward_context.virtual_engine]
     self.impl.forward(query,
                       key,
                       value,
@@ -285,7 +291,6 @@ def unified_attention_with_output_fake(
     key: torch.Tensor,
     value: torch.Tensor,
     output: torch.Tensor,
-    kv_cache: torch.Tensor,
     layer_name: str,
 ) -> None:
     return
@@ -294,7 +299,7 @@ def unified_attention_with_output_fake(
 direct_register_custom_op(
     op_name="unified_attention_with_output",
     op_func=unified_attention_with_output,
-    mutates_args=["kv_cache", "output"],
+    mutates_args=["output"],
     fake_impl=unified_attention_with_output_fake,
     dispatch_key=current_platform.dispatch_key,
 )
diff --git a/vllm/config.py b/vllm/config.py
index 19609085cc960..13b5390008a35 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -2780,7 +2780,6 @@ def model_post_init(self, __context: Any) -> None:
     compilation_time: float = PrivateAttr
 
     # Per-model forward context
-    # Mainly used to store attention cls
     # Map from layer name to the attention cls
     static_forward_context: Dict[str, Any] = PrivateAttr
 
diff --git a/vllm/forward_context.py b/vllm/forward_context.py
index 7f56575279e9b..828b394ec5d21 100644
--- a/vllm/forward_context.py
+++ b/vllm/forward_context.py
@@ -2,7 +2,7 @@
 from collections import defaultdict
 from contextlib import contextmanager
 from dataclasses import dataclass
-from typing import Any, Dict, Optional
+from typing import TYPE_CHECKING, Any, Dict, Optional
 
 import torch
 
@@ -10,6 +10,9 @@
 from vllm.config import VllmConfig
 from vllm.logger import init_logger
 
+if TYPE_CHECKING:
+    from vllm.attention.backends.abstract import AttentionMetadata
+
 logger = init_logger(__name__)
 
 track_batchsize: bool = envs.VLLM_LOG_BATCHSIZE_INTERVAL >= 0
@@ -21,9 +24,12 @@
 
 @dataclass
 class ForwardContext:
-    static_forward_context: Dict[str, Any]
+    # reference to vllm_config.compilation_config.static_forward_context
+    attn_layers: Dict[str, Any]
     # TODO: extend to support per-layer dynamic forward context
-    dynamic_forward_context: Any
+    attn_metadata: "AttentionMetadata"  # set dynamically for each forward pass
+    # TODO: remove after making all virtual_engines share the same kv cache
+    virtual_engine: int  # set dynamically for each forward pass
 
 
 _forward_context: Optional[ForwardContext] = None
@@ -38,34 +44,35 @@ def get_forward_context() -> ForwardContext:
 
 
 @contextmanager
-def set_forward_context(context: Any, vllm_config: VllmConfig):
+def set_forward_context(attn_metadata: Any,
+                        vllm_config: VllmConfig,
+                        virtual_engine: int = 0):
     """A context manager that stores the current forward context,
     can be attention metadata, etc.
     Here we can inject common logic for every model forward pass.
     """
     global forward_start_time
-    need_to_track_batchsize = track_batchsize and context is not None
+    need_to_track_batchsize = track_batchsize and attn_metadata is not None
     if need_to_track_batchsize:
         forward_start_time = time.perf_counter()
     global _forward_context
     prev_context = _forward_context
     _forward_context = ForwardContext(
-        static_forward_context=vllm_config.compilation_config.
-        static_forward_context,
-        dynamic_forward_context=context)
+        attn_layers=vllm_config.compilation_config.static_forward_context,
+        virtual_engine=virtual_engine,
+        attn_metadata=attn_metadata)
     try:
         yield
     finally:
-        global batchsize_counter
         global last_logging_time, batchsize_logging_interval
         if need_to_track_batchsize:
-            if hasattr(context, "num_prefill_tokens"):
+            if hasattr(attn_metadata, "num_prefill_tokens"):
                 # for v0 attention backends
-                batchsize = context.num_prefill_tokens + \
-                    context.num_decode_tokens
+                batchsize = attn_metadata.num_prefill_tokens + \
+                    attn_metadata.num_decode_tokens
             else:
                 # for v1 attention backends
-                batchsize = context.num_input_tokens
+                batchsize = attn_metadata.num_input_tokens
             # we use synchronous scheduling right now,
             # adding a sync point here should not affect
             # scheduling of the next batch
diff --git a/vllm/utils.py b/vllm/utils.py
index 487088591ebc2..8c3e5200b3d98 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -2138,3 +2138,38 @@ def get_mp_context():
     _check_multiproc_method()
     mp_method = envs.VLLM_WORKER_MULTIPROC_METHOD
     return multiprocessing.get_context(mp_method)
+
+
+def bind_kv_cache(
+        ctx: Dict[str, Any],
+        kv_cache: List[List[torch.Tensor]],  # [virtual_engine][layer_index]
+) -> None:
+    # Bind the kv_cache tensors to the Attention modules in ctx, roughly
+    # ctx[layer_name].kv_cache[ve]=kv_cache[ve][extract_layer_index(layer_name)]
+    # Special cases handled here:
+    # 1. Some models have non-attention layers, e.g., Jamba
+    # 2. Pipeline parallelism: each rank only has a subset of layers
+    # 3. Encoder attention has no kv cache
+    # 4. Encoder-decoder models: the decoder self-attention and the
+    #    encoder-decoder attention of the same layer (e.g., bart's
+    #    decoder.layers.1.self_attn and decoder.layers.1.encoder_attn) are
+    #    mapped to the same kv cache tensor
+    from vllm.attention import AttentionType
+    from vllm.model_executor.models.utils import extract_layer_index
+    layer_need_kv_cache = [
+        layer_name for layer_name in ctx
+        if ctx[layer_name].attn_type in (AttentionType.DECODER,
+                                         AttentionType.ENCODER_DECODER)
+    ]
+    layer_index_sorted = sorted(
+        set(
+            extract_layer_index(layer_name)
+            for layer_name in layer_need_kv_cache))
+    for layer_name in layer_need_kv_cache:
+        kv_cache_idx = layer_index_sorted.index(
+            extract_layer_index(layer_name))
+        forward_ctx = ctx[layer_name]
+        assert len(forward_ctx.kv_cache) == len(kv_cache)
+        for ve, ve_kv_cache in enumerate(kv_cache):
+            assert forward_ctx.kv_cache[ve].numel() == 0
+            forward_ctx.kv_cache[ve] = ve_kv_cache[kv_cache_idx]
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index a1d4f9b135789..fb87dc5a8222a 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -16,7 +16,8 @@
 from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs
 from vllm.sampling_params import SamplingType
 from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler,
-                        LayerBlockType, cdiv, is_pin_memory_available)
+                        LayerBlockType, bind_kv_cache, cdiv,
+                        is_pin_memory_available)
 from vllm.v1.attention.backends.flash_attn import (FlashAttentionBackend,
                                                    FlashAttentionMetadata)
 from vllm.v1.engine.mm_input_mapper import MMInputMapperClient
@@ -860,3 +861,6 @@ def initialize_kv_cache(self, num_blocks: int) -> None:
                 torch.zeros(kv_cache_shape,
                             dtype=self.kv_cache_dtype,
                             device=self.device))
+        bind_kv_cache(
+            self.vllm_config.compilation_config.static_forward_context,
+            [self.kv_caches])
diff --git a/vllm/worker/cpu_enc_dec_model_runner.py b/vllm/worker/cpu_enc_dec_model_runner.py
index cc24cfe04d2ba..fa6775cbd6c66 100644
--- a/vllm/worker/cpu_enc_dec_model_runner.py
+++ b/vllm/worker/cpu_enc_dec_model_runner.py
@@ -305,7 +305,8 @@ def execute_model(
             intermediate_tensors,
         }
 
-        with set_forward_context(model_input.attn_metadata, self.vllm_config):
+        with set_forward_context(model_input.attn_metadata, self.vllm_config,
+                                 model_input.virtual_engine):
             hidden_states = model_executable(**execute_model_kwargs)
 
         # Compute the logits.
diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py
index f1531e0fc0675..d99db4e0c6c40 100644
--- a/vllm/worker/cpu_model_runner.py
+++ b/vllm/worker/cpu_model_runner.py
@@ -526,7 +526,8 @@ def execute_model(
             execute_model_kwargs.update(
                 {"previous_hidden_states": previous_hidden_states})
 
-        with set_forward_context(model_input.attn_metadata, self.vllm_config):
+        with set_forward_context(model_input.attn_metadata, self.vllm_config,
+                                 model_input.virtual_engine):
             hidden_states = model_executable(
                 input_ids=model_input.input_tokens,
                 positions=model_input.input_positions,
diff --git a/vllm/worker/cpu_pooling_model_runner.py b/vllm/worker/cpu_pooling_model_runner.py
index 17b2fd2564a04..d31ba89e12375 100644
--- a/vllm/worker/cpu_pooling_model_runner.py
+++ b/vllm/worker/cpu_pooling_model_runner.py
@@ -69,7 +69,8 @@ def execute_model(
             intermediate_tensors,
         }
 
-        with set_forward_context(model_input.attn_metadata, self.vllm_config):
+        with set_forward_context(model_input.attn_metadata, self.vllm_config,
+                                 model_input.virtual_engine):
             hidden_states = model_executable(**execute_model_kwargs)
 
         # Only perform pooling in the driver worker.
diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py
index b5dfebfce6f75..494c6506f3c0f 100644
--- a/vllm/worker/cpu_worker.py
+++ b/vllm/worker/cpu_worker.py
@@ -13,7 +13,7 @@
 from vllm.logger import init_logger
 from vllm.model_executor import set_random_seed
 from vllm.sequence import ExecuteModelRequest
-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
+from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, bind_kv_cache
 from vllm.worker.cpu_enc_dec_model_runner import CPUEncoderDecoderModelRunner
 from vllm.worker.cpu_model_runner import CPUModelRunner, CPUModelRunnerBase
 from vllm.worker.cpu_pooling_model_runner import CPUPoolingModelRunner
@@ -293,6 +293,8 @@ def _init_cache_engine(self) -> None:
             self.cache_engine[ve].cpu_cache
             for ve in range(self.parallel_config.pipeline_parallel_size)
         ]
+        bind_kv_cache(self.compilation_config.static_forward_context,
+                      self.cpu_cache)
         self.model_runner.block_size = self.cache_engine[0].block_size
 
         assert all(
diff --git a/vllm/worker/enc_dec_model_runner.py b/vllm/worker/enc_dec_model_runner.py
index 4d5d918087be8..8a161b740042d 100644
--- a/vllm/worker/enc_dec_model_runner.py
+++ b/vllm/worker/enc_dec_model_runner.py
@@ -175,7 +175,8 @@ def execute_model(
         } if self.has_inner_state else {}
 
         multi_modal_kwargs = model_input.multi_modal_kwargs or {}
-        with set_forward_context(model_input.attn_metadata, self.vllm_config):
+        with set_forward_context(model_input.attn_metadata, self.vllm_config,
+                                 model_input.virtual_engine):
             hidden_or_intermediate_states = model_executable(
                 input_ids=model_input.input_tokens,
                 positions=model_input.input_positions,
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index 1c6d1bbee78ee..2b918483d3675 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -1527,7 +1527,8 @@ def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None:
                         self._update_inputs_to_capture_for_enc_dec_model(
                             capture_inputs)
 
-                    with set_forward_context(attn_metadata, self.vllm_config):
+                    with set_forward_context(attn_metadata, self.vllm_config,
+                                             virtual_engine):
                         graph_runner.capture(**capture_inputs)
                     self.graph_memory_pool = graph_runner.graph.pool()
                     self.graph_runners[virtual_engine][batch_size] = (
@@ -1695,7 +1696,7 @@ def execute_model(
 
         if not bypass_model_exec:
             with set_forward_context(model_input.attn_metadata,
-                                     self.vllm_config):
+                                     self.vllm_config, virtual_engine):
                 hidden_or_intermediate_states = model_executable(
                     input_ids=model_input.input_tokens,
                     positions=model_input.input_positions,
diff --git a/vllm/worker/pooling_model_runner.py b/vllm/worker/pooling_model_runner.py
index f79b3773bcbd2..6de227f3cb2b9 100644
--- a/vllm/worker/pooling_model_runner.py
+++ b/vllm/worker/pooling_model_runner.py
@@ -105,7 +105,8 @@ def execute_model(
         if model_input.token_types is not None:
             cross_enc_kwargs["token_type_ids"] = model_input.token_types
 
-        with set_forward_context(model_input.attn_metadata, self.vllm_config):
+        with set_forward_context(model_input.attn_metadata, self.vllm_config,
+                                 virtual_engine):
             hidden_or_intermediate_states = model_executable(
                 input_ids=model_input.input_tokens,
                 positions=model_input.input_positions,
diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py
index f51b51d433d3d..0f12549e3f3fd 100644
--- a/vllm/worker/worker.py
+++ b/vllm/worker/worker.py
@@ -21,7 +21,7 @@
 from vllm.prompt_adapter.request import PromptAdapterRequest
 from vllm.sequence import (ExecuteModelRequest, IntermediateTensors,
                            SequenceGroupMetadata, SequenceGroupMetadataDelta)
-from vllm.utils import GiB_bytes, memory_profiling
+from vllm.utils import GiB_bytes, bind_kv_cache, memory_profiling
 from vllm.worker.cache_engine import CacheEngine
 from vllm.worker.enc_dec_model_runner import EncoderDecoderModelRunner
 from vllm.worker.model_runner import GPUModelRunnerBase, ModelRunner
@@ -285,6 +285,8 @@ def _init_cache_engine(self):
             self.cache_engine[ve].gpu_cache
             for ve in range(self.parallel_config.pipeline_parallel_size)
         ]
+        bind_kv_cache(self.compilation_config.static_forward_context,
+                      self.gpu_cache)
 
     def _warm_up_model(self) -> None:
         if not self.model_config.enforce_eager:
diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py
index 249b3ed2dfd37..a835718e1db19 100644
--- a/vllm/worker/worker_base.py
+++ b/vllm/worker/worker_base.py
@@ -43,6 +43,7 @@ def __init__(
         self.prompt_adapter_config = vllm_config.prompt_adapter_config
         self.observability_config = vllm_config.observability_config
         self.kv_transfer_config = vllm_config.kv_transfer_config
+        self.compilation_config = vllm_config.compilation_config
         from vllm.platforms import current_platform
         self.current_platform = current_platform
 

From ac2f3f7fee93cf9cd97c0078e362feab7b6c8299 Mon Sep 17 00:00:00 2001
From: Joe Runde <Joseph.Runde@ibm.com>
Date: Fri, 10 Jan 2025 00:56:36 -0700
Subject: [PATCH 066/115] [Bugfix] Validate lora adapters to avoid crashing
 server (#11727)

Signed-off-by: Joe Runde <Joseph.Runde@ibm.com>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
---
 .../entrypoints/openai/test_lora_adapters.py  | 269 ++++++++++++++++++
 tests/entrypoints/openai/test_lora_lineage.py | 109 -------
 tests/entrypoints/openai/test_serving_chat.py |   8 +-
 .../entrypoints/openai/test_serving_models.py |  10 +-
 tests/entrypoints/openai/test_shutdown.py     |  27 +-
 vllm/engine/async_llm_engine.py               |   4 +
 vllm/engine/multiprocessing/__init__.py       |  20 +-
 vllm/engine/multiprocessing/client.py         |  42 ++-
 vllm/engine/multiprocessing/engine.py         |  27 +-
 vllm/engine/protocol.py                       |   5 +
 vllm/entrypoints/openai/api_server.py         |   7 +-
 vllm/entrypoints/openai/run_batch.py          |   1 +
 vllm/entrypoints/openai/serving_models.py     |  78 +++--
 vllm/lora/worker_manager.py                   |  19 +-
 vllm/v1/engine/async_llm.py                   |   4 +
 15 files changed, 459 insertions(+), 171 deletions(-)
 create mode 100644 tests/entrypoints/openai/test_lora_adapters.py
 delete mode 100644 tests/entrypoints/openai/test_lora_lineage.py

diff --git a/tests/entrypoints/openai/test_lora_adapters.py b/tests/entrypoints/openai/test_lora_adapters.py
new file mode 100644
index 0000000000000..46a064f6d9e68
--- /dev/null
+++ b/tests/entrypoints/openai/test_lora_adapters.py
@@ -0,0 +1,269 @@
+import asyncio
+import json
+import shutil
+from contextlib import suppress
+
+import openai  # use the official client for correctness check
+import pytest
+import pytest_asyncio
+# downloading lora to test lora requests
+from huggingface_hub import snapshot_download
+
+from ...utils import RemoteOpenAIServer
+
+# any model with a chat template should work here
+MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
+# technically this needs Mistral-7B-v0.1 as base, but we're not testing
+# generation quality here
+LORA_NAME = "typeof/zephyr-7b-beta-lora"
+
+
+@pytest.fixture(scope="module")
+def zephyr_lora_files():
+    return snapshot_download(repo_id=LORA_NAME)
+
+
+@pytest.fixture(scope="module")
+def server_with_lora_modules_json(zephyr_lora_files):
+    # Define the json format LoRA module configurations
+    lora_module_1 = {
+        "name": "zephyr-lora",
+        "path": zephyr_lora_files,
+        "base_model_name": MODEL_NAME
+    }
+
+    lora_module_2 = {
+        "name": "zephyr-lora2",
+        "path": zephyr_lora_files,
+        "base_model_name": MODEL_NAME
+    }
+
+    args = [
+        # use half precision for speed and memory savings in CI environment
+        "--dtype",
+        "bfloat16",
+        "--max-model-len",
+        "8192",
+        "--enforce-eager",
+        # lora config below
+        "--enable-lora",
+        "--lora-modules",
+        json.dumps(lora_module_1),
+        json.dumps(lora_module_2),
+        "--max-lora-rank",
+        "64",
+        "--max-cpu-loras",
+        "2",
+        "--max-num-seqs",
+        "64",
+    ]
+
+    # Enable the /v1/load_lora_adapter endpoint
+    envs = {"VLLM_ALLOW_RUNTIME_LORA_UPDATING": "True"}
+
+    with RemoteOpenAIServer(MODEL_NAME, args, env_dict=envs) as remote_server:
+        yield remote_server
+
+
+@pytest_asyncio.fixture
+async def client(server_with_lora_modules_json):
+    async with server_with_lora_modules_json.get_async_client(
+    ) as async_client:
+        yield async_client
+
+
+@pytest.mark.asyncio
+async def test_static_lora_lineage(client: openai.AsyncOpenAI,
+                                   zephyr_lora_files):
+    models = await client.models.list()
+    models = models.data
+    served_model = models[0]
+    lora_models = models[1:]
+    assert served_model.id == MODEL_NAME
+    assert served_model.root == MODEL_NAME
+    assert served_model.parent is None
+    assert all(lora_model.root == zephyr_lora_files
+               for lora_model in lora_models)
+    assert all(lora_model.parent == MODEL_NAME for lora_model in lora_models)
+    assert lora_models[0].id == "zephyr-lora"
+    assert lora_models[1].id == "zephyr-lora2"
+
+
+@pytest.mark.asyncio
+async def test_dynamic_lora_lineage(client: openai.AsyncOpenAI,
+                                    zephyr_lora_files):
+    """An adapter loaded at runtime is listed by /models with its lineage."""
+    response = await client.post("load_lora_adapter",
+                                 cast_to=str,
+                                 body={
+                                     "lora_name": "zephyr-lora-3",
+                                     "lora_path": zephyr_lora_files
+                                 })
+    # Ensure adapter loads before querying /models
+    assert "success" in response
+
+    models = await client.models.list()
+    models = models.data
+    dynamic_lora_model = models[-1]
+    assert dynamic_lora_model.root == zephyr_lora_files
+    assert dynamic_lora_model.parent == MODEL_NAME
+    assert dynamic_lora_model.id == "zephyr-lora-3"
+
+
+@pytest.mark.asyncio
+async def test_dynamic_lora_not_found(client: openai.AsyncOpenAI):
+    with pytest.raises(openai.NotFoundError):
+        await client.post("load_lora_adapter",
+                          cast_to=str,
+                          body={
+                              "lora_name": "notfound",
+                              "lora_path": "/not/an/adapter"
+                          })
+
+
+@pytest.mark.asyncio
+async def test_dynamic_lora_invalid_files(client: openai.AsyncOpenAI,
+                                          tmp_path):
+    invalid_files = tmp_path / "invalid_files"
+    invalid_files.mkdir()
+    (invalid_files / "adapter_config.json").write_text("this is not json")
+
+    with pytest.raises(openai.BadRequestError):
+        await client.post("load_lora_adapter",
+                          cast_to=str,
+                          body={
+                              "lora_name": "invalid-json",
+                              "lora_path": str(invalid_files)
+                          })
+
+
+@pytest.mark.asyncio
+async def test_dynamic_lora_invalid_lora_rank(client: openai.AsyncOpenAI,
+                                              tmp_path, zephyr_lora_files):
+    """Loading an adapter whose rank exceeds --max-lora-rank is rejected."""
+    invalid_rank = tmp_path / "invalid_rank"
+
+    # Copy adapter from zephyr_lora_files to invalid_rank
+    shutil.copytree(zephyr_lora_files, invalid_rank)
+
+    with open(invalid_rank / "adapter_config.json") as f:
+        adapter_config = json.load(f)
+
+    # Change rank to an invalid value: the server in this module is started
+    # with --max-lora-rank=64, so a rank of 1024 must be rejected
+    adapter_config["r"] = 1024
+
+    with open(invalid_rank / "adapter_config.json", "w") as f:
+        json.dump(adapter_config, f)
+
+    # Expect a 400 whose message names the max_lora_rank violation
+    with pytest.raises(openai.BadRequestError,
+                       match="is greater than max_lora_rank"):
+        await client.post("load_lora_adapter",
+                          cast_to=str,
+                          body={
+                              "lora_name": "invalid-json",
+                              "lora_path": str(invalid_rank)
+                          })
+
+
+@pytest.mark.asyncio
+async def test_multiple_lora_adapters(client: openai.AsyncOpenAI, tmp_path,
+                                      zephyr_lora_files):
+    """Validate that many loras can be dynamically registered and used for
+    inference concurrently."""
+
+    # This test file configures the server with --max-cpu-loras=2 and this test
+    # will concurrently load 10 adapters, so it should flex the LRU cache
+    async def load_and_run_adapter(adapter_name: str):
+        await client.post("load_lora_adapter",
+                          cast_to=str,
+                          body={
+                              "lora_name": adapter_name,
+                              "lora_path": str(zephyr_lora_files)
+                          })
+        for _ in range(3):
+            await client.completions.create(
+                model=adapter_name,
+                prompt=["Hello there", "Foo bar bazz buzz"],
+                max_tokens=5,
+            )
+
+    lora_tasks = []
+    for i in range(10):
+        lora_tasks.append(
+            asyncio.create_task(load_and_run_adapter(f"adapter_{i}")))
+
+    # Collect raised exceptions as values; asyncio.wait() only returns Tasks
+    results = await asyncio.gather(*lora_tasks, return_exceptions=True)
+    for r in results:
+        assert not isinstance(r, Exception), f"Got exception {r}"
+
+
+@pytest.mark.asyncio
+async def test_loading_invalid_adapters_does_not_break_others(
+        client: openai.AsyncOpenAI, tmp_path, zephyr_lora_files):
+    """Failed adapter loads must not disturb in-flight LoRA requests."""
+    invalid_files = tmp_path / "invalid_files"
+    invalid_files.mkdir()
+    (invalid_files / "adapter_config.json").write_text("this is not json")
+
+    stop_good_requests_event = asyncio.Event()
+
+    async def run_good_requests(client):
+        # Run completions requests until the stop event is set
+
+        results = []
+
+        while not stop_good_requests_event.is_set():
+            try:
+                batch = await client.completions.create(
+                    model="zephyr-lora",
+                    prompt=["Hello there", "Foo bar bazz buzz"],
+                    max_tokens=5,
+                )
+                results.append(batch)
+            except Exception as e:
+                results.append(e)
+
+        return results
+
+    # Create task to run good requests
+    good_task = asyncio.create_task(run_good_requests(client))
+
+    # Run a bunch of bad adapter loads
+    for _ in range(25):
+        with suppress(openai.NotFoundError):
+            await client.post("load_lora_adapter",
+                              cast_to=str,
+                              body={
+                                  "lora_name": "notfound",
+                                  "lora_path": "/not/an/adapter"
+                              })
+    for _ in range(25):
+        with suppress(openai.BadRequestError):
+            await client.post("load_lora_adapter",
+                              cast_to=str,
+                              body={
+                                  "lora_name": "invalid",
+                                  "lora_path": str(invalid_files)
+                              })
+
+    # Ensure all the running requests with lora adapters succeeded
+    stop_good_requests_event.set()
+    results = await good_task
+    for r in results:
+        assert not isinstance(r, Exception), f"Got exception {r}"
+
+    # Ensure we can load another adapter and run it
+    await client.post("load_lora_adapter",
+                      cast_to=str,
+                      body={
+                          "lora_name": "valid",
+                          "lora_path": zephyr_lora_files
+                      })
+    await client.completions.create(
+        model="valid",
+        prompt=["Hello there", "Foo bar bazz buzz"],
+        max_tokens=5,
+    )
diff --git a/tests/entrypoints/openai/test_lora_lineage.py b/tests/entrypoints/openai/test_lora_lineage.py
deleted file mode 100644
index ce4f85c13fff9..0000000000000
--- a/tests/entrypoints/openai/test_lora_lineage.py
+++ /dev/null
@@ -1,109 +0,0 @@
-import json
-
-import openai  # use the official client for correctness check
-import pytest
-import pytest_asyncio
-# downloading lora to test lora requests
-from huggingface_hub import snapshot_download
-
-from ...utils import RemoteOpenAIServer
-
-# any model with a chat template should work here
-MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
-# technically this needs Mistral-7B-v0.1 as base, but we're not testing
-# generation quality here
-LORA_NAME = "typeof/zephyr-7b-beta-lora"
-
-
-@pytest.fixture(scope="module")
-def zephyr_lora_files():
-    return snapshot_download(repo_id=LORA_NAME)
-
-
-@pytest.fixture(scope="module")
-def server_with_lora_modules_json(zephyr_lora_files):
-    # Define the json format LoRA module configurations
-    lora_module_1 = {
-        "name": "zephyr-lora",
-        "path": zephyr_lora_files,
-        "base_model_name": MODEL_NAME
-    }
-
-    lora_module_2 = {
-        "name": "zephyr-lora2",
-        "path": zephyr_lora_files,
-        "base_model_name": MODEL_NAME
-    }
-
-    args = [
-        # use half precision for speed and memory savings in CI environment
-        "--dtype",
-        "bfloat16",
-        "--max-model-len",
-        "8192",
-        "--enforce-eager",
-        # lora config below
-        "--enable-lora",
-        "--lora-modules",
-        json.dumps(lora_module_1),
-        json.dumps(lora_module_2),
-        "--max-lora-rank",
-        "64",
-        "--max-cpu-loras",
-        "2",
-        "--max-num-seqs",
-        "64",
-    ]
-
-    # Enable the /v1/load_lora_adapter endpoint
-    envs = {"VLLM_ALLOW_RUNTIME_LORA_UPDATING": "True"}
-
-    with RemoteOpenAIServer(MODEL_NAME, args, env_dict=envs) as remote_server:
-        yield remote_server
-
-
-@pytest_asyncio.fixture
-async def client_for_lora_lineage(server_with_lora_modules_json):
-    async with server_with_lora_modules_json.get_async_client(
-    ) as async_client:
-        yield async_client
-
-
-@pytest.mark.asyncio
-async def test_static_lora_lineage(client_for_lora_lineage: openai.AsyncOpenAI,
-                                   zephyr_lora_files):
-    models = await client_for_lora_lineage.models.list()
-    models = models.data
-    served_model = models[0]
-    lora_models = models[1:]
-    assert served_model.id == MODEL_NAME
-    assert served_model.root == MODEL_NAME
-    assert served_model.parent is None
-    assert all(lora_model.root == zephyr_lora_files
-               for lora_model in lora_models)
-    assert all(lora_model.parent == MODEL_NAME for lora_model in lora_models)
-    assert lora_models[0].id == "zephyr-lora"
-    assert lora_models[1].id == "zephyr-lora2"
-
-
-@pytest.mark.asyncio
-async def test_dynamic_lora_lineage(
-        client_for_lora_lineage: openai.AsyncOpenAI, zephyr_lora_files):
-
-    response = await client_for_lora_lineage.post("load_lora_adapter",
-                                                  cast_to=str,
-                                                  body={
-                                                      "lora_name":
-                                                      "zephyr-lora-3",
-                                                      "lora_path":
-                                                      zephyr_lora_files
-                                                  })
-    # Ensure adapter loads before querying /models
-    assert "success" in response
-
-    models = await client_for_lora_lineage.models.list()
-    models = models.data
-    dynamic_lora_model = models[-1]
-    assert dynamic_lora_model.root == zephyr_lora_files
-    assert dynamic_lora_model.parent == MODEL_NAME
-    assert dynamic_lora_model.id == "zephyr-lora-3"
diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py
index f431d1065e0eb..85f485364a411 100644
--- a/tests/entrypoints/openai/test_serving_chat.py
+++ b/tests/entrypoints/openai/test_serving_chat.py
@@ -52,7 +52,7 @@ async def _async_serving_chat_init():
     engine = MockEngine()
     model_config = await engine.get_model_config()
 
-    models = OpenAIServingModels(model_config, BASE_MODEL_PATHS)
+    models = OpenAIServingModels(engine, model_config, BASE_MODEL_PATHS)
     serving_completion = OpenAIServingChat(engine,
                                            model_config,
                                            models,
@@ -73,7 +73,8 @@ def test_serving_chat_should_set_correct_max_tokens():
     mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
     mock_engine.errored = False
 
-    models = OpenAIServingModels(base_model_paths=BASE_MODEL_PATHS,
+    models = OpenAIServingModels(engine_client=mock_engine,
+                                 base_model_paths=BASE_MODEL_PATHS,
                                  model_config=MockModelConfig())
     serving_chat = OpenAIServingChat(mock_engine,
                                      MockModelConfig(),
@@ -116,7 +117,8 @@ def test_serving_chat_could_load_correct_generation_config():
     mock_engine.errored = False
 
     # Initialize the serving chat
-    models = OpenAIServingModels(base_model_paths=BASE_MODEL_PATHS,
+    models = OpenAIServingModels(engine_client=mock_engine,
+                                 base_model_paths=BASE_MODEL_PATHS,
                                  model_config=mock_model_config)
     serving_chat = OpenAIServingChat(mock_engine,
                                      mock_model_config,
diff --git a/tests/entrypoints/openai/test_serving_models.py b/tests/entrypoints/openai/test_serving_models.py
index 96897dc730da2..657ea20213ec9 100644
--- a/tests/entrypoints/openai/test_serving_models.py
+++ b/tests/entrypoints/openai/test_serving_models.py
@@ -4,6 +4,7 @@
 import pytest
 
 from vllm.config import ModelConfig
+from vllm.engine.protocol import EngineClient
 from vllm.entrypoints.openai.protocol import (ErrorResponse,
                                               LoadLoraAdapterRequest,
                                               UnloadLoraAdapterRequest)
@@ -21,13 +22,16 @@
 
 async def _async_serving_models_init() -> OpenAIServingModels:
     mock_model_config = MagicMock(spec=ModelConfig)
+    mock_engine_client = MagicMock(spec=EngineClient)
     # Set the max_model_len attribute to avoid missing attribute
     mock_model_config.max_model_len = 2048
 
-    serving_models = OpenAIServingModels(base_model_paths=BASE_MODEL_PATHS,
+    serving_models = OpenAIServingModels(engine_client=mock_engine_client,
+                                         base_model_paths=BASE_MODEL_PATHS,
                                          model_config=mock_model_config,
                                          lora_modules=None,
                                          prompt_adapters=None)
+    await serving_models.init_static_loras()
 
     return serving_models
 
@@ -113,5 +117,5 @@ async def test_unload_lora_adapter_not_found():
     request = UnloadLoraAdapterRequest(lora_name="nonexistent_adapter")
     response = await serving_models.unload_lora_adapter(request)
     assert isinstance(response, ErrorResponse)
-    assert response.type == "InvalidUserInput"
-    assert response.code == HTTPStatus.BAD_REQUEST
+    assert response.type == "NotFoundError"
+    assert response.code == HTTPStatus.NOT_FOUND
diff --git a/tests/entrypoints/openai/test_shutdown.py b/tests/entrypoints/openai/test_shutdown.py
index 6fcc92022855b..090523a836e12 100644
--- a/tests/entrypoints/openai/test_shutdown.py
+++ b/tests/entrypoints/openai/test_shutdown.py
@@ -1,6 +1,3 @@
-import json
-import os
-
 import openai
 import pytest
 
@@ -10,16 +7,7 @@
 
 
 @pytest.mark.asyncio
-async def test_shutdown_on_engine_failure(tmp_path):
-    # Use a bad adapter to crash the engine
-    # (This test will fail when that bug is fixed)
-    adapter_path = tmp_path / "bad_adapter"
-    os.mkdir(adapter_path)
-    with open(adapter_path / "adapter_model_config.json", "w") as f:
-        json.dump({"not": "real"}, f)
-    with open(adapter_path / "adapter_model.safetensors", "wb") as f:
-        f.write(b"this is fake")
-
+async def test_shutdown_on_engine_failure():
     # dtype, max-len etc set so that this can run in CI
     args = [
         "--dtype",
@@ -29,9 +17,6 @@ async def test_shutdown_on_engine_failure(tmp_path):
         "--enforce-eager",
         "--max-num-seqs",
         "128",
-        "--enable-lora",
-        "--lora-modules",
-        f"bad-adapter={tmp_path / 'bad_adapter'}",
     ]
 
     with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
@@ -39,9 +24,13 @@ async def test_shutdown_on_engine_failure(tmp_path):
 
             with pytest.raises(
                 (openai.APIConnectionError, openai.InternalServerError)):
-                # This crashes the engine
-                await client.completions.create(model="bad-adapter",
-                                                prompt="Hello, my name is")
+                # Asking for lots of prompt logprobs will currently crash the
+                # engine. This may change in the future when that bug is fixed
+                prompt = "Hello " * 4000
+                await client.completions.create(
+                    model=MODEL_NAME,
+                    prompt=prompt,
+                    extra_body={"prompt_logprobs": 10})
 
             # Now the server should shut down
             return_code = remote_server.proc.wait(timeout=8)
diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py
index 66a5089074ff5..da23ed19ef7be 100644
--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@@ -1257,6 +1257,10 @@ async def stop_profile(self) -> None:
         else:
             self.engine.model_executor._run_workers("stop_profile")
 
+    async def add_lora(self, lora_request: LoRARequest) -> None:
+        """Load a new LoRA adapter into the engine for future requests."""
+        self.engine.add_lora(lora_request)
+
 
 # TODO(v1): Remove this class proxy when V1 goes default.
 if envs.VLLM_USE_V1:
diff --git a/vllm/engine/multiprocessing/__init__.py b/vllm/engine/multiprocessing/__init__.py
index 420f540d0b5f4..7132f9840001a 100644
--- a/vllm/engine/multiprocessing/__init__.py
+++ b/vllm/engine/multiprocessing/__init__.py
@@ -1,4 +1,5 @@
-from dataclasses import dataclass
+import uuid
+from dataclasses import dataclass, field
 from enum import Enum
 from typing import List, Mapping, Optional, Union, overload
 
@@ -120,10 +121,23 @@ class RPCUProfileRequest(Enum):
     STOP_PROFILE = 2
 
 
+@dataclass
+class RPCLoadAdapterRequest:
+    lora_request: LoRARequest
+    # Set the default value of request_id to a new UUID
+    request_id: str = field(default_factory=lambda: str(uuid.uuid4()))
+
+
+@dataclass
+class RPCAdapterLoadedResponse:
+    request_id: str
+
+
 RPC_REQUEST_T = Union[RPCProcessRequest, RPCAbortRequest, RPCStartupRequest,
-                      RPCUProfileRequest]
+                      RPCUProfileRequest, RPCLoadAdapterRequest]
 
-REQUEST_OUTPUTS_T = Union[List[RequestOutput], RPCError]
+REQUEST_OUTPUTS_T = Union[List[RequestOutput], RPCAdapterLoadedResponse,
+                          RPCError]
 
 
 def ENGINE_DEAD_ERROR(
diff --git a/vllm/engine/multiprocessing/client.py b/vllm/engine/multiprocessing/client.py
index 0a046c71e86e8..a9ab899535180 100644
--- a/vllm/engine/multiprocessing/client.py
+++ b/vllm/engine/multiprocessing/client.py
@@ -25,8 +25,10 @@
                                          IPC_HEALTH_EXT, IPC_INPUT_EXT,
                                          IPC_OUTPUT_EXT, RPC_REQUEST_T,
                                          VLLM_RPC_SUCCESS_STR, RPCAbortRequest,
-                                         RPCError, RPCProcessRequest,
-                                         RPCStartupRequest, RPCStartupResponse,
+                                         RPCAdapterLoadedResponse, RPCError,
+                                         RPCLoadAdapterRequest,
+                                         RPCProcessRequest, RPCStartupRequest,
+                                         RPCStartupResponse,
                                          RPCUProfileRequest)
 from vllm.engine.protocol import EngineClient
 # yapf: enable
@@ -240,17 +242,22 @@ async def run_output_handler_loop(self):
                         queue = self.output_queues.get(request_id)
                         if queue is not None:
                             queue.put_nowait(exception)
+                # Put each output into the appropriate queue.
+                elif isinstance(request_outputs, RPCAdapterLoadedResponse):
+                    self._add_output(request_outputs)
                 else:
-                    # Put each output into the appropriate steam.
                     for request_output in request_outputs:
-                        queue = self.output_queues.get(
-                            request_output.request_id)
-                        if queue is not None:
-                            queue.put_nowait(request_output)
+                        self._add_output(request_output)
 
         except asyncio.CancelledError:
             logger.debug("Shutting down MQLLMEngineClient output handler.")
 
+    def _add_output(self, request_output: Union[RequestOutput,
+                                                RPCAdapterLoadedResponse]):
+        queue = self.output_queues.get(request_output.request_id)
+        if queue is not None:
+            queue.put_nowait(request_output)
+
     async def setup(self):
         """Setup the client before it starts sending server requests."""
 
@@ -659,3 +666,24 @@ async def stop_profile(self) -> None:
 
         await self._send_one_way_rpc_request(
             request=RPCUProfileRequest.STOP_PROFILE, socket=self.input_socket)
+
+    async def add_lora(self, lora_request: LoRARequest) -> None:
+        """Load a new LoRA adapter into the engine for future requests."""
+        # Uses the same I/O as generate requests
+        request = RPCLoadAdapterRequest(lora_request)
+        # Create output queue for this request.
+        queue: asyncio.Queue[Union[None, BaseException]] = asyncio.Queue()
+        self.output_queues[request.request_id] = queue
+        try:
+            # Send the request, then wait for the response
+            request_bytes = pickle.dumps(request)
+            await self.input_socket.send_multipart((request_bytes, ),
+                                                   copy=False)
+            request_output = await queue.get()
+        finally:
+            # Always unregister the queue, even on send failure/cancellation
+            self.output_queues.pop(request.request_id, None)
+
+        # Raise on error, otherwise happily return None
+        if isinstance(request_output, BaseException):
+            raise request_output
diff --git a/vllm/engine/multiprocessing/engine.py b/vllm/engine/multiprocessing/engine.py
index 49a90b321dac4..36f4df4b02731 100644
--- a/vllm/engine/multiprocessing/engine.py
+++ b/vllm/engine/multiprocessing/engine.py
@@ -14,8 +14,10 @@
                                          IPC_HEALTH_EXT, IPC_INPUT_EXT,
                                          IPC_OUTPUT_EXT, REQUEST_OUTPUTS_T,
                                          VLLM_RPC_SUCCESS_STR, RPCAbortRequest,
-                                         RPCError, RPCProcessRequest,
-                                         RPCStartupRequest, RPCStartupResponse,
+                                         RPCAdapterLoadedResponse, RPCError,
+                                         RPCLoadAdapterRequest,
+                                         RPCProcessRequest, RPCStartupRequest,
+                                         RPCStartupResponse,
                                          RPCUProfileRequest)
 # yapf: enable
 from vllm.executor.gpu_executor import GPUExecutor
@@ -234,6 +236,8 @@ def handle_new_input(self):
                         self.start_profile()
                     else:
                         self.stop_profile()
+                elif isinstance(request, RPCLoadAdapterRequest):
+                    self._handle_load_adapter_request(request)
                 else:
                     raise ValueError("Unknown RPCRequest Type: "
                                      f"{type(request)}")
@@ -284,6 +288,19 @@ def _handle_abort_request(self, request: RPCAbortRequest):
         if self.log_requests:
             logger.info("Aborted request %s.", request.request_id)
 
+    def _handle_load_adapter_request(self, request: RPCLoadAdapterRequest):
+        """Load a LoRA adapter, replying with success or an RPCError."""
+        try:
+            self.engine.add_lora(request.lora_request)
+        except BaseException as e:
+            # Report the failed load and stop: no success reply may follow it
+            rpc_err = RPCError(request_id=request.request_id,
+                               is_engine_errored=False, exception=e)
+            self._send_outputs(rpc_err)
+            return
+        self._send_outputs(
+            RPCAdapterLoadedResponse(request_id=request.request_id))
+
     def _health_check(self):
         # Send unhealthy if engine has already errored
         if self._errored_with is not None:
@@ -296,7 +313,11 @@ def _health_check(self):
             self._send_unhealthy(e)
 
     def _send_outputs(self, outputs: REQUEST_OUTPUTS_T):
-        """Send List of RequestOutput to RPCClient."""
+        """Send outputs back to the engine client. These can be:
+        - Exceptions
+        - A list of generation outputs
+        - A response from loading a lora adapter
+        """
         if outputs:
             try:
                 from ray.exceptions import RayTaskError
diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py
index a066836b92708..f05ff62c4766b 100644
--- a/vllm/engine/protocol.py
+++ b/vllm/engine/protocol.py
@@ -270,3 +270,8 @@ async def start_profile(self) -> None:
     async def stop_profile(self) -> None:
         """Start profiling the engine"""
         ...
+
+    @abstractmethod
+    async def add_lora(self, lora_request: LoRARequest) -> None:
+        """Load a new LoRA adapter into the engine for future requests."""
+        ...
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index bc1471e1f534d..925d7db43138b 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -662,7 +662,7 @@ async def add_request_id(request: Request, call_next):
     return app
 
 
-def init_app_state(
+async def init_app_state(
     engine_client: EngineClient,
     model_config: ModelConfig,
     state: State,
@@ -690,12 +690,13 @@ def init_app_state(
     logger.info("Using supplied chat template:\n%s", resolved_chat_template)
 
     state.openai_serving_models = OpenAIServingModels(
+        engine_client=engine_client,
         model_config=model_config,
         base_model_paths=base_model_paths,
         lora_modules=args.lora_modules,
         prompt_adapters=args.prompt_adapters,
     )
-    # TODO: The chat template is now broken for lora adapters :(
+    await state.openai_serving_models.init_static_loras()
     state.openai_serving_chat = OpenAIServingChat(
         engine_client,
         model_config,
@@ -794,7 +795,7 @@ def signal_handler(*_) -> None:
         app = build_app(args)
 
         model_config = await engine_client.get_model_config()
-        init_app_state(engine_client, model_config, app.state, args)
+        await init_app_state(engine_client, model_config, app.state, args)
 
         shutdown_task = await serve_http(
             app,
diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py
index 822c0f5f7c211..f8f136f9d5024 100644
--- a/vllm/entrypoints/openai/run_batch.py
+++ b/vllm/entrypoints/openai/run_batch.py
@@ -215,6 +215,7 @@ async def main(args):
 
     # Create the openai serving objects.
     openai_serving_models = OpenAIServingModels(
+        engine_client=engine,
         model_config=model_config,
         base_model_paths=base_model_paths,
         lora_modules=None,
diff --git a/vllm/entrypoints/openai/serving_models.py b/vllm/entrypoints/openai/serving_models.py
index 26966896bc272..a222eafadcb68 100644
--- a/vllm/entrypoints/openai/serving_models.py
+++ b/vllm/entrypoints/openai/serving_models.py
@@ -5,15 +5,19 @@
 from typing import List, Optional, Union
 
 from vllm.config import ModelConfig
+from vllm.engine.protocol import EngineClient
 from vllm.entrypoints.openai.protocol import (ErrorResponse,
                                               LoadLoraAdapterRequest,
                                               ModelCard, ModelList,
                                               ModelPermission,
                                               UnloadLoraAdapterRequest)
+from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.prompt_adapter.request import PromptAdapterRequest
 from vllm.utils import AtomicCounter
 
+logger = init_logger(__name__)
+
 
 @dataclass
 class BaseModelPath:
@@ -45,6 +49,7 @@ class OpenAIServingModels:
 
     def __init__(
         self,
+        engine_client: EngineClient,
         model_config: ModelConfig,
         base_model_paths: List[BaseModelPath],
         *,
@@ -55,20 +60,11 @@ def __init__(
 
         self.base_model_paths = base_model_paths
         self.max_model_len = model_config.max_model_len
+        self.engine_client = engine_client
 
+        self.static_lora_modules = lora_modules
+        self.lora_requests: List[LoRARequest] = []
         self.lora_id_counter = AtomicCounter(0)
-        self.lora_requests = []
-        if lora_modules is not None:
-            self.lora_requests = [
-                LoRARequest(lora_name=lora.name,
-                            lora_int_id=i,
-                            lora_path=lora.path,
-                            base_model_name=lora.base_model_name
-                            if lora.base_model_name
-                            and self.is_base_model(lora.base_model_name) else
-                            self.base_model_paths[0].name)
-                for i, lora in enumerate(lora_modules, start=1)
-            ]
 
         self.prompt_adapter_requests = []
         if prompt_adapters is not None:
@@ -84,6 +80,19 @@ def __init__(
                         prompt_adapter_local_path=prompt_adapter.local_path,
                         prompt_adapter_num_virtual_tokens=num_virtual_tokens))
 
+    async def init_static_loras(self):
+        """Loads all static LoRA modules.
+        Raises if any fail to load"""
+        if self.static_lora_modules is None:
+            return
+        for lora in self.static_lora_modules:
+            load_request = LoadLoraAdapterRequest(lora_path=lora.path,
+                                                  lora_name=lora.name)
+            load_result = await self.load_lora_adapter(
+                request=load_request, base_model_name=lora.base_model_name)
+            if isinstance(load_result, ErrorResponse):
+                raise ValueError(load_result.message)
+
     def is_base_model(self, model_name):
         return any(model.name == model_name for model in self.base_model_paths)
 
@@ -129,17 +138,47 @@ async def show_available_models(self) -> ModelList:
 
     async def load_lora_adapter(
             self,
-            request: LoadLoraAdapterRequest) -> Union[ErrorResponse, str]:
+            request: LoadLoraAdapterRequest,
+            base_model_name: Optional[str] = None
+    ) -> Union[ErrorResponse, str]:
         error_check_ret = await self._check_load_lora_adapter_request(request)
         if error_check_ret is not None:
             return error_check_ret
 
         lora_name, lora_path = request.lora_name, request.lora_path
         unique_id = self.lora_id_counter.inc(1)
-        self.lora_requests.append(
-            LoRARequest(lora_name=lora_name,
-                        lora_int_id=unique_id,
-                        lora_path=lora_path))
+        lora_request = LoRARequest(lora_name=lora_name,
+                                   lora_int_id=unique_id,
+                                   lora_path=lora_path)
+        if base_model_name is not None and self.is_base_model(base_model_name):
+            lora_request.base_model_name = base_model_name
+
+        # Validate that the adapter can be loaded into the engine
+        # This will also pre-load it for incoming requests
+        try:
+            await self.engine_client.add_lora(lora_request)
+        except ValueError as e:
+            # Adapter not found or lora configuration errors
+            if "No adapter found" in str(e):
+                return create_error_response(message=str(e),
+                                             err_type="NotFoundError",
+                                             status_code=HTTPStatus.NOT_FOUND)
+            else:
+                return create_error_response(
+                    message=str(e),
+                    err_type="BadRequestError",
+                    status_code=HTTPStatus.BAD_REQUEST)
+        except BaseException as e:
+            # Some other unexpected problem loading the adapter, e.g. malformed
+            # input files.
+            # More detailed error messages for the user would be nicer here
+            return create_error_response(message=str(e),
+                                         err_type="BadRequestError",
+                                         status_code=HTTPStatus.BAD_REQUEST)
+
+        self.lora_requests.append(lora_request)
+        logger.info("Loaded new LoRA adapter: name '%s', path '%s'", lora_name,
+                    lora_path)
         return f"Success: LoRA adapter '{lora_name}' added successfully."
 
     async def unload_lora_adapter(
@@ -155,6 +194,7 @@ async def unload_lora_adapter(
             lora_request for lora_request in self.lora_requests
             if lora_request.lora_name != lora_name
         ]
+        logger.info("Removed LoRA adapter: name '%s'", lora_name)
         return f"Success: LoRA adapter '{lora_name}' removed successfully."
 
     async def _check_load_lora_adapter_request(
@@ -195,8 +235,8 @@ async def _check_unload_lora_adapter_request(
             return create_error_response(
                 message=
                 f"The lora adapter '{request.lora_name}' cannot be found.",
-                err_type="InvalidUserInput",
-                status_code=HTTPStatus.BAD_REQUEST)
+                err_type="NotFoundError",
+                status_code=HTTPStatus.NOT_FOUND)
 
         return None
 
diff --git a/vllm/lora/worker_manager.py b/vllm/lora/worker_manager.py
index 10976fac23028..eec462743fe9d 100644
--- a/vllm/lora/worker_manager.py
+++ b/vllm/lora/worker_manager.py
@@ -115,6 +115,14 @@ def _load_adapter(self, lora_request: LoRARequest) -> LoRAModel:
                 embedding_padding_modules=self.embedding_padding_modules,
                 weights_mapper=hf_to_vllm_mapper)
 
+        except FileNotFoundError as e:
+            # FileNotFoundError should be raised if both
+            # - No adapter found to download from huggingface (or in
+            #       offline mode)
+            # - No local adapter files found at `lora_request.lora_path`
+            raise ValueError(
+                f"Loading lora {lora_request.lora_name} failed: No adapter "
+                f"found for {lora_path}") from e
         except Exception as e:
             raise RuntimeError(f"Loading lora {lora_path} failed") from e
         if lora.rank > self.lora_config.max_lora_rank:
@@ -209,12 +217,19 @@ def _apply_adapters(self, lora_requests: Set[LoRARequest]) -> None:
 
     def add_adapter(self, lora_request: LoRARequest) -> bool:
         if lora_request.lora_int_id not in self.list_adapters():
-            # Remove before we load the new lora to save memory
+            # Load the new adapter first to ensure it is actually valid, before
+            # evicting any existing adapters.
+            # This may cause the # of loaded lora adapters to very temporarily
+            # exceed `--max-cpu-loras`.
+            lora = self._load_adapter(lora_request)
+
+            # Loading succeeded, now check if we will exceed cache capacity and
+            # evict the oldest adapter if so
             if len(self._adapter_manager) + 1 > self._adapter_manager.capacity:
                 assert isinstance(self._adapter_manager,
                                   LRUCacheLoRAModelManager)
                 self._adapter_manager.remove_oldest_adapter()
-            lora = self._load_adapter(lora_request)
+            # Then add the new adapter to the cache
             loaded = self._adapter_manager.add_adapter(lora)
         else:
             # If the lora is already loaded, just touch it to
diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py
index b963ba74f13f0..5daae45dee85c 100644
--- a/vllm/v1/engine/async_llm.py
+++ b/vllm/v1/engine/async_llm.py
@@ -339,3 +339,7 @@ def errored(self) -> bool:
     @property
     def dead_error(self) -> BaseException:
         return Exception()  # TODO: implement
+
+    async def add_lora(self, lora_request: LoRARequest) -> None:
+        """Load a new LoRA adapter into the engine for future requests."""
+        raise NotImplementedError("LoRA not yet supported in V1")

From 61af6332565d0093855fee7266699e548b1c0d1c Mon Sep 17 00:00:00 2001
From: Kunshang Ji <kunshang.ji@intel.com>
Date: Fri, 10 Jan 2025 16:20:46 +0800
Subject: [PATCH 067/115] [BUGFIX] Fix `UnspecifiedPlatform` package name
 (#11916)

Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
---
 vllm/platforms/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/platforms/__init__.py b/vllm/platforms/__init__.py
index f6ac14446c021..6ca95b41dbb07 100644
--- a/vllm/platforms/__init__.py
+++ b/vllm/platforms/__init__.py
@@ -179,7 +179,7 @@ def resolve_current_platform_cls_qualname() -> str:
         logger.info("Automatically detected platform %s.",
                     activated_builtin_plugins[0])
     else:
-        platform_cls_qualname = "vllm.interface.UnspecifiedPlatform"
+        platform_cls_qualname = "vllm.platforms.interface.UnspecifiedPlatform"
         logger.info(
             "No platform detected, vLLM is running on UnspecifiedPlatform")
     return platform_cls_qualname

From d53575a5f0e5c0f9003b4ec6e33c8bf761e93cef Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Fri, 10 Jan 2025 16:25:17 +0800
Subject: [PATCH 068/115] [ci] fix gh200 tests (#11919)

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 vllm/model_executor/model_loader/weight_utils.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py
index 11d5fd7135d9e..9cfcdbf620d2b 100644
--- a/vllm/model_executor/model_loader/weight_utils.py
+++ b/vllm/model_executor/model_loader/weight_utils.py
@@ -29,7 +29,9 @@
 
 try:
     from runai_model_streamer import SafetensorsStreamer
-except ImportError:
+except (ImportError, OSError):
+    # see https://github.com/run-ai/runai-model-streamer/issues/26
+    # OSError will be raised on arm64 platform
     runai_model_streamer = PlaceholderModule(
         "runai_model_streamer")  # type: ignore[assignment]
     SafetensorsStreamer = runai_model_streamer.placeholder_attr(

From d907be7dc7926e64d6240bf4425d7399eaed150e Mon Sep 17 00:00:00 2001
From: cennn <61925104+cennn@users.noreply.github.com>
Date: Fri, 10 Jan 2025 17:18:25 +0800
Subject: [PATCH 069/115] [misc] remove python function call for custom
 activation op (#11885)

Co-authored-by: youkaichao <youkaichao@gmail.com>
---
 vllm/_custom_ops.py                      | 27 --------
 vllm/model_executor/layers/activation.py | 79 ++++++++++++++----------
 2 files changed, 46 insertions(+), 60 deletions(-)

diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py
index afb350591e562..d04cbbc0a9eed 100644
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -34,33 +34,6 @@ def register_fake(fn):
         from torch.library import impl_abstract as register_fake
 
 
-# activation ops
-def gelu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
-    torch.ops._C.gelu_and_mul(out, x)
-
-
-def gelu_tanh_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
-    torch.ops._C.gelu_tanh_and_mul(out, x)
-
-
-def fatrelu_and_mul(out: torch.Tensor,
-                    x: torch.Tensor,
-                    threshold: float = 0.0) -> None:
-    torch.ops._C.fatrelu_and_mul(out, x, threshold)
-
-
-def gelu_fast(out: torch.Tensor, x: torch.Tensor) -> None:
-    torch.ops._C.gelu_fast(out, x)
-
-
-def gelu_new(out: torch.Tensor, x: torch.Tensor) -> None:
-    torch.ops._C.gelu_new(out, x)
-
-
-def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None:
-    torch.ops._C.gelu_quick(out, x)
-
-
 # page attention ops
 def paged_attention_v1(
     out: torch.Tensor,
diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py
index 32456fee06a28..2475190d197d3 100644
--- a/vllm/model_executor/layers/activation.py
+++ b/vllm/model_executor/layers/activation.py
@@ -30,6 +30,8 @@ class FatreluAndMul(CustomOp):
     def __init__(self, threshold: float = 0.):
         super().__init__()
         self.threshold = threshold
+        if current_platform.is_cuda_alike() or current_platform.is_cpu():
+            self.op = torch.ops._C.fatrelu_and_mul
 
     def forward_native(self, x: torch.Tensor) -> torch.Tensor:
         d = x.shape[-1] // 2
@@ -39,12 +41,10 @@ def forward_native(self, x: torch.Tensor) -> torch.Tensor:
         return x1 * x2
 
     def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
-        from vllm import _custom_ops as ops
-
         d = x.shape[-1] // 2
         output_shape = (x.shape[:-1] + (d, ))
         out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
-        ops.fatrelu_and_mul(out, x, self.threshold)
+        self.op(out, x, self.threshold)
         return out
 
 
@@ -103,6 +103,17 @@ def __init__(self, approximate: str = "none"):
         self.approximate = approximate
         if approximate not in ("none", "tanh"):
             raise ValueError(f"Unknown approximate mode: {approximate}")
+        if current_platform.is_cuda_alike() or current_platform.is_cpu():
+            if approximate == "none":
+                self.op = torch.ops._C.gelu_and_mul
+            elif approximate == "tanh":
+                self.op = torch.ops._C.gelu_tanh_and_mul
+        elif current_platform.is_xpu():
+            from vllm._ipex_ops import ipex_ops
+            if approximate == "none":
+                self.op = ipex_ops.gelu_and_mul
+            else:
+                self.op = ipex_ops.gelu_tanh_and_mul
 
     def forward_native(self, x: torch.Tensor) -> torch.Tensor:
         """PyTorch-native implementation equivalent to forward()."""
@@ -110,27 +121,17 @@ def forward_native(self, x: torch.Tensor) -> torch.Tensor:
         return F.gelu(x[..., :d], approximate=self.approximate) * x[..., d:]
 
     def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
-        from vllm import _custom_ops as ops
-
         d = x.shape[-1] // 2
         output_shape = (x.shape[:-1] + (d, ))
         out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
-        if self.approximate == "none":
-            ops.gelu_and_mul(out, x)
-        elif self.approximate == "tanh":
-            ops.gelu_tanh_and_mul(out, x)
+        self.op(out, x)
         return out
 
     def forward_xpu(self, x: torch.Tensor) -> torch.Tensor:
-        from vllm._ipex_ops import ipex_ops as ops
-
         d = x.shape[-1] // 2
         output_shape = (x.shape[:-1] + (d, ))
         out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
-        if self.approximate == "none":
-            ops.gelu_and_mul(out, x)
-        elif self.approximate == "tanh":
-            ops.gelu_tanh_and_mul(out, x)
+        self.op(out, x)
         return out
 
     def extra_repr(self) -> str:
@@ -140,6 +141,14 @@ def extra_repr(self) -> str:
 @CustomOp.register("gelu_new")
 class NewGELU(CustomOp):
 
+    def __init__(self):
+        super().__init__()
+        if current_platform.is_cuda_alike() or current_platform.is_cpu():
+            self.op = torch.ops._C.gelu_new
+        elif current_platform.is_xpu():
+            from vllm._ipex_ops import ipex_ops
+            self.op = ipex_ops.gelu_new
+
     def forward_native(self, x: torch.Tensor) -> torch.Tensor:
         """PyTorch-native implementation equivalent to forward()."""
         c = math.sqrt(2.0 / math.pi)
@@ -147,58 +156,62 @@ def forward_native(self, x: torch.Tensor) -> torch.Tensor:
                                            (x + 0.044715 * torch.pow(x, 3.0))))
 
     def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
-        from vllm import _custom_ops as ops
-
         out = torch.empty_like(x)
-        ops.gelu_new(out, x)
+        self.op(out, x)
         return out
 
     def forward_xpu(self, x: torch.Tensor) -> torch.Tensor:
-        from vllm._ipex_ops import ipex_ops as ops
-
-        return ops.gelu_new(x)
+        return self.op(x)
 
 
 @CustomOp.register("gelu_fast")
 class FastGELU(CustomOp):
 
+    def __init__(self):
+        super().__init__()
+        if current_platform.is_cuda_alike() or current_platform.is_cpu():
+            self.op = torch.ops._C.gelu_fast
+        elif current_platform.is_xpu():
+            from vllm._ipex_ops import ipex_ops
+            self.op = ipex_ops.gelu_fast
+
     def forward_native(self, x: torch.Tensor) -> torch.Tensor:
         """PyTorch-native implementation equivalent to forward()."""
         return 0.5 * x * (1.0 + torch.tanh(x * 0.7978845608 *
                                            (1.0 + 0.044715 * x * x)))
 
     def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
-        from vllm import _custom_ops as ops
-
         out = torch.empty_like(x)
-        ops.gelu_fast(out, x)
+        self.op(out, x)
         return out
 
     def forward_xpu(self, x: torch.Tensor) -> torch.Tensor:
-        from vllm._ipex_ops import ipex_ops as ops
-
-        return ops.gelu_fast(x)
+        return self.op(x)
 
 
 @CustomOp.register("quick_gelu")
 class QuickGELU(CustomOp):
     # https://github.com/huggingface/transformers/blob/main/src/transformers/activations.py#L90
+    def __init__(self):
+        super().__init__()
+        if current_platform.is_cuda_alike() or current_platform.is_cpu():
+            self.op = torch.ops._C.gelu_quick
+        elif current_platform.is_xpu():
+            from vllm._ipex_ops import ipex_ops
+            self.op = ipex_ops.gelu_quick
+
     def forward_native(self, x: torch.Tensor) -> torch.Tensor:
         """PyTorch-native implementation equivalent to forward()."""
         return x * torch.sigmoid(1.702 * x)
 
     def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
-        from vllm import _custom_ops as ops
-
         out = torch.empty_like(x)
-        ops.gelu_quick(out, x)
+        self.op(out, x)
         return out
 
     def forward_xpu(self, x: torch.Tensor) -> torch.Tensor:
-        from vllm._ipex_ops import ipex_ops as ops
-
         out = torch.empty_like(x)
-        ops.gelu_quick(out, x)
+        self.op(out, x)
         return out
 
     # TODO implement forward_xpu for QuickGELU

From ef725feafcc1f2d6763cc888751fb2b36840587b Mon Sep 17 00:00:00 2001
From: wangxiyuan <wangxiyuan1007@gmail.com>
Date: Fri, 10 Jan 2025 18:02:38 +0800
Subject: [PATCH 070/115] [platform] support pytorch custom op pluggable
 (#11328)

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
---
 vllm/model_executor/custom_op.py | 7 +++++++
 vllm/platforms/interface.py      | 4 ++++
 2 files changed, 11 insertions(+)

diff --git a/vllm/model_executor/custom_op.py b/vllm/model_executor/custom_op.py
index 401606e8c76f0..96995c56bf504 100644
--- a/vllm/model_executor/custom_op.py
+++ b/vllm/model_executor/custom_op.py
@@ -57,6 +57,11 @@ def forward_hpu(self, *args, **kwargs):
         # PyTorch-native implementation.
         return self.forward_native(*args, **kwargs)
 
+    def forward_oot(self, *args, **kwargs):
+        # By default, we assume that OOT ops are compatible with the
+        # PyTorch-native implementation.
+        return self.forward_native(*args, **kwargs)
+
     def dispatch_forward(self):
         # NOTE(woosuk): Here we assume that vLLM was built for only one
         # specific backend. Currently, we do not support dynamic dispatching.
@@ -81,6 +86,8 @@ def dispatch_forward(self):
             return self.forward_tpu
         elif current_platform.is_xpu():
             return self.forward_xpu
+        elif current_platform.is_out_of_tree():
+            return self.forward_oot
         else:
             return self.forward_cuda
 
diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py
index f440358f65fbb..01d753408e6d0 100644
--- a/vllm/platforms/interface.py
+++ b/vllm/platforms/interface.py
@@ -45,6 +45,7 @@ class PlatformEnum(enum.Enum):
     CPU = enum.auto()
     NEURON = enum.auto()
     OPENVINO = enum.auto()
+    OOT = enum.auto()
     UNSPECIFIED = enum.auto()
 
 
@@ -107,6 +108,9 @@ def is_neuron(self) -> bool:
     def is_openvino(self) -> bool:
         return self._enum == PlatformEnum.OPENVINO
 
+    def is_out_of_tree(self) -> bool:
+        return self._enum == PlatformEnum.OOT
+
     def is_cuda_alike(self) -> bool:
         """Stateless version of :func:`torch.cuda.is_available`."""
         return self._enum in (PlatformEnum.CUDA, PlatformEnum.ROCM)

From d85c47d6ad24c286ae55fd9da231808b8ddd7a7f Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Fri, 10 Jan 2025 12:05:56 +0000
Subject: [PATCH 071/115] Replace "online inference" with "online serving"
 (#11923)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 .buildkite/run-cpu-test.sh                                    | 2 +-
 docs/source/features/structured_outputs.md                    | 4 ++--
 docs/source/getting_started/installation/hpu-gaudi.md         | 4 ++--
 docs/source/getting_started/quickstart.md                     | 2 +-
 docs/source/models/generative_models.md                       | 2 +-
 docs/source/models/pooling_models.md                          | 2 +-
 docs/source/models/supported_models.md                        | 4 ++--
 docs/source/serving/multimodal_inputs.md                      | 2 +-
 .../openai_chat_completion_client_for_multimodal.py           | 4 ++--
 tests/models/decoder_only/audio_language/test_ultravox.py     | 4 ++--
 vllm/model_executor/models/molmo.py                           | 2 +-
 11 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh
index 1a4dae8f65e99..5a285be039393 100644
--- a/.buildkite/run-cpu-test.sh
+++ b/.buildkite/run-cpu-test.sh
@@ -61,7 +61,7 @@ function cpu_tests() {
     pytest -s -v -k cpu_model \
     tests/basic_correctness/test_chunked_prefill.py"  
 
-  # online inference
+  # online serving
   docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
     set -e
     export VLLM_CPU_KVCACHE_SPACE=10 
diff --git a/docs/source/features/structured_outputs.md b/docs/source/features/structured_outputs.md
index ccd9a6a1b1a14..a42c3dd64ad10 100644
--- a/docs/source/features/structured_outputs.md
+++ b/docs/source/features/structured_outputs.md
@@ -5,7 +5,7 @@
 vLLM supports the generation of structured outputs using [outlines](https://github.com/dottxt-ai/outlines), [lm-format-enforcer](https://github.com/noamgat/lm-format-enforcer), or [xgrammar](https://github.com/mlc-ai/xgrammar) as backends for the guided decoding.
 This document shows you some examples of the different options that are available to generate structured outputs.
 
-## Online Inference (OpenAI API)
+## Online Serving (OpenAI API)
 
 You can generate structured outputs using the OpenAI's [Completions](https://platform.openai.com/docs/api-reference/completions) and [Chat](https://platform.openai.com/docs/api-reference/chat) API.
 
@@ -239,7 +239,7 @@ The main available options inside `GuidedDecodingParams` are:
 - `backend`
 - `whitespace_pattern`
 
-These parameters can be used in the same way as the parameters from the Online Inference examples above.
+These parameters can be used in the same way as the parameters from the Online Serving examples above.
 One example for the usage of the `choices` parameter is shown below:
 
 ```python
diff --git a/docs/source/getting_started/installation/hpu-gaudi.md b/docs/source/getting_started/installation/hpu-gaudi.md
index 1d50cef3bdc83..21822327c8822 100644
--- a/docs/source/getting_started/installation/hpu-gaudi.md
+++ b/docs/source/getting_started/installation/hpu-gaudi.md
@@ -83,7 +83,7 @@ $ python setup.py develop
 ## Supported Features
 
 - [Offline inference](#offline-inference)
-- Online inference via [OpenAI-Compatible Server](#openai-compatible-server)
+- Online serving via [OpenAI-Compatible Server](#openai-compatible-server)
 - HPU autodetection - no need to manually select device within vLLM
 - Paged KV cache with algorithms enabled for Intel Gaudi accelerators
 - Custom Intel Gaudi implementations of Paged Attention, KV cache ops,
@@ -385,5 +385,5 @@ the below:
   completely. With HPU Graphs disabled, you are trading latency and
   throughput at lower batches for potentially higher throughput on
   higher batches. You can do that by adding `--enforce-eager` flag to
-  server (for online inference), or by passing `enforce_eager=True`
+  server (for online serving), or by passing `enforce_eager=True`
   argument to LLM constructor (for offline inference).
diff --git a/docs/source/getting_started/quickstart.md b/docs/source/getting_started/quickstart.md
index ea15d9ef065fa..d7d43785c6c24 100644
--- a/docs/source/getting_started/quickstart.md
+++ b/docs/source/getting_started/quickstart.md
@@ -5,7 +5,7 @@
 This guide will help you quickly get started with vLLM to perform:
 
 - [Offline batched inference](#quickstart-offline)
-- [Online inference using OpenAI-compatible server](#quickstart-online)
+- [Online serving using OpenAI-compatible server](#quickstart-online)
 
 ## Prerequisites
 
diff --git a/docs/source/models/generative_models.md b/docs/source/models/generative_models.md
index a9f74c4d3fbb8..6a5a58ad74ab7 100644
--- a/docs/source/models/generative_models.md
+++ b/docs/source/models/generative_models.md
@@ -118,7 +118,7 @@ print("Loaded chat template:", custom_template)
 outputs = llm.chat(conversation, chat_template=custom_template)
 ```
 
-## Online Inference
+## Online Serving
 
 Our [OpenAI-Compatible Server](#openai-compatible-server) provides endpoints that correspond to the offline APIs:
 
diff --git a/docs/source/models/pooling_models.md b/docs/source/models/pooling_models.md
index 745f3fd81980d..324b1f550e694 100644
--- a/docs/source/models/pooling_models.md
+++ b/docs/source/models/pooling_models.md
@@ -127,7 +127,7 @@ print(f"Score: {score}")
 
 A code example can be found here: <gh-file:examples/offline_inference/offline_inference_scoring.py>
 
-## Online Inference
+## Online Serving
 
 Our [OpenAI-Compatible Server](#openai-compatible-server) provides endpoints that correspond to the offline APIs:
 
diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md
index acbe27a22a679..72910ea1e2d19 100644
--- a/docs/source/models/supported_models.md
+++ b/docs/source/models/supported_models.md
@@ -552,7 +552,7 @@ See [this page](#multimodal-inputs) on how to pass multi-modal inputs to the mod
 
 ````{important}
 To enable multiple multi-modal items per text prompt, you have to set `limit_mm_per_prompt` (offline inference)
-or `--limit-mm-per-prompt` (online inference). For example, to enable passing up to 4 images per text prompt:
+or `--limit-mm-per-prompt` (online serving). For example, to enable passing up to 4 images per text prompt:
 
 Offline inference:
 ```python
@@ -562,7 +562,7 @@ llm = LLM(
 )
 ```
 
-Online inference:
+Online serving:
 ```bash
 vllm serve Qwen/Qwen2-VL-7B-Instruct --limit-mm-per-prompt image=4
 ```
diff --git a/docs/source/serving/multimodal_inputs.md b/docs/source/serving/multimodal_inputs.md
index 9f5e1b908d786..7e96ed46f2dcc 100644
--- a/docs/source/serving/multimodal_inputs.md
+++ b/docs/source/serving/multimodal_inputs.md
@@ -199,7 +199,7 @@ for o in outputs:
     print(generated_text)
 ```
 
-## Online Inference
+## Online Serving
 
 Our OpenAI-compatible server accepts multi-modal data via the [Chat Completions API](https://platform.openai.com/docs/api-reference/chat).
 
diff --git a/examples/online_serving/openai_chat_completion_client_for_multimodal.py b/examples/online_serving/openai_chat_completion_client_for_multimodal.py
index 213d075542e81..03cc037bb6779 100644
--- a/examples/online_serving/openai_chat_completion_client_for_multimodal.py
+++ b/examples/online_serving/openai_chat_completion_client_for_multimodal.py
@@ -1,5 +1,5 @@
 """An example showing how to use vLLM to serve multimodal models 
-and run online inference with OpenAI client.
+and run online serving with OpenAI client.
 
 Launch the vLLM server with the following command:
 
@@ -309,7 +309,7 @@ def main(args) -> None:
 
 if __name__ == "__main__":
     parser = FlexibleArgumentParser(
-        description='Demo on using OpenAI client for online inference with '
+        description='Demo on using OpenAI client for online serving with '
         'multimodal language models served with vLLM.')
     parser.add_argument('--chat-type',
                         '-c',
diff --git a/tests/models/decoder_only/audio_language/test_ultravox.py b/tests/models/decoder_only/audio_language/test_ultravox.py
index 0bb98df1b58e6..1e329dc4cb22e 100644
--- a/tests/models/decoder_only/audio_language/test_ultravox.py
+++ b/tests/models/decoder_only/audio_language/test_ultravox.py
@@ -237,8 +237,8 @@ def test_models_with_multiple_audios(vllm_runner, audio_assets, dtype: str,
 
 
 @pytest.mark.asyncio
-async def test_online_inference(client, audio_assets):
-    """Exercises online inference with/without chunked prefill enabled."""
+async def test_online_serving(client, audio_assets):
+    """Exercises online serving with/without chunked prefill enabled."""
 
     messages = [{
         "role":
diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py
index 2e60bc719f096..c45ee9b921c9e 100644
--- a/vllm/model_executor/models/molmo.py
+++ b/vllm/model_executor/models/molmo.py
@@ -1068,7 +1068,7 @@ def input_processor_for_molmo(ctx: InputContext, inputs: DecoderOnlyInputs):
         trust_remote_code=model_config.trust_remote_code)
 
     # NOTE: message formatting for raw text prompt is only applied for
-    # offline inference; for online inference, the prompt is always in
+    # offline inference; for online serving, the prompt is always in
     # instruction format and tokenized.
     if prompt is not None and re.match(r"^User:[\s\S]*?(Assistant:)*$",
                                        prompt):

From 241ad7b301facac0728e2b3312d71fe47acc8c9e Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Fri, 10 Jan 2025 20:45:33 +0800
Subject: [PATCH 072/115] [ci] Fix sampler tests (#11922)

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 .buildkite/test-pipeline.yaml |  1 +
 tests/conftest.py             | 11 +++++++++--
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index e288f8f30159a..7d13269540864 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -214,6 +214,7 @@ steps:
   - vllm/model_executor/layers
   - vllm/sampling_metadata.py
   - tests/samplers
+  - tests/conftest.py
   commands:
     - pytest -v -s samplers
     - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
diff --git a/tests/conftest.py b/tests/conftest.py
index 917151ddcb8d4..95af4ac1eb17b 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -28,12 +28,13 @@
                               init_distributed_environment,
                               initialize_model_parallel)
 from vllm.inputs import (ExplicitEncoderDecoderPrompt, TextPrompt,
-                         to_enc_dec_tuple_list, zip_enc_dec_prompts)
+                         TokensPrompt, to_enc_dec_tuple_list,
+                         zip_enc_dec_prompts)
 from vllm.logger import init_logger
 from vllm.outputs import RequestOutput
 from vllm.sampling_params import BeamSearchParams
 from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, cuda_device_count_stateless,
-                        identity)
+                        identity, is_list_of)
 
 logger = init_logger(__name__)
 
@@ -886,6 +887,12 @@ def generate_beam_search(
         beam_width: int,
         max_tokens: int,
     ) -> List[Tuple[List[List[int]], List[str]]]:
+        if is_list_of(prompts, str, check="all"):
+            prompts = [TextPrompt(prompt=prompt) for prompt in prompts]
+        else:
+            prompts = [
+                TokensPrompt(prompt_token_ids=tokens) for tokens in prompts
+            ]
         outputs = self.model.beam_search(
             prompts,
             BeamSearchParams(beam_width=beam_width, max_tokens=max_tokens))

From 12664ddda522b3a22c5b71eca9b2c907e3a687b3 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Fri, 10 Jan 2025 22:30:25 +0800
Subject: [PATCH 073/115] [Doc] [1/N] Initial guide for merged multi-modal
 processor (#11925)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 docs/requirements-docs.txt                    |   1 +
 docs/source/api/multimodal/index.md           |   2 +-
 docs/source/api/multimodal/inputs.md          |   2 +-
 docs/source/conf.py                           |   1 +
 docs/source/contributing/model/index.md       |   2 +-
 docs/source/contributing/model/multimodal.md  | 380 +++++++++++++++---
 .../source/contributing/model/registration.md |   2 +-
 .../input_processing_pipeline.md              |  19 -
 .../input_processing/model_inputs_index.md    |  43 --
 docs/source/design/mm_processing.md           |  64 +++
 docs/source/index.md                          |   2 +-
 docs/source/serving/multimodal_inputs.md      |   2 +-
 vllm/config.py                                |   3 +-
 vllm/inputs/__init__.py                       |   3 -
 vllm/inputs/registry.py                       |  12 +-
 vllm/multimodal/__init__.py                   |   4 +-
 vllm/multimodal/base.py                       |  14 -
 vllm/multimodal/inputs.py                     |   3 +-
 vllm/multimodal/registry.py                   |  12 +-
 19 files changed, 403 insertions(+), 168 deletions(-)
 delete mode 100644 docs/source/design/input_processing/input_processing_pipeline.md
 delete mode 100644 docs/source/design/input_processing/model_inputs_index.md
 create mode 100644 docs/source/design/mm_processing.md

diff --git a/docs/requirements-docs.txt b/docs/requirements-docs.txt
index 64cf6ef8fc19d..8217bc3ba3ded 100644
--- a/docs/requirements-docs.txt
+++ b/docs/requirements-docs.txt
@@ -3,6 +3,7 @@ sphinx-book-theme==1.0.1
 sphinx-copybutton==0.5.2
 myst-parser==3.0.1
 sphinx-argparse==0.4.0
+sphinx-design==0.6.1
 sphinx-togglebutton==0.3.2
 msgspec
 cloudpickle
diff --git a/docs/source/api/multimodal/index.md b/docs/source/api/multimodal/index.md
index 51e24795a34cf..14efdb506d76f 100644
--- a/docs/source/api/multimodal/index.md
+++ b/docs/source/api/multimodal/index.md
@@ -7,7 +7,7 @@ vLLM provides experimental support for multi-modal models through the {mod}`vllm
 Multi-modal inputs can be passed alongside text and token prompts to [supported models](#supported-mm-models)
 via the `multi_modal_data` field in {class}`vllm.inputs.PromptType`.
 
-Looking to add your own multi-modal model? Please follow the instructions listed [here](#enabling-multimodal-inputs).
+Looking to add your own multi-modal model? Please follow the instructions listed [here](#supports-multimodal).
 
 ## Module Contents
 
diff --git a/docs/source/api/multimodal/inputs.md b/docs/source/api/multimodal/inputs.md
index 3d89666113229..76b2fb95a5009 100644
--- a/docs/source/api/multimodal/inputs.md
+++ b/docs/source/api/multimodal/inputs.md
@@ -3,7 +3,7 @@
 ## User-facing inputs
 
 ```{eval-rst}
-.. autodata:: vllm.multimodal.MultiModalDataDict
+.. autodata:: vllm.multimodal.inputs.MultiModalDataDict
 ```
 
 ## Internal data structures
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 1ce11fe057071..bff0141ffbce8 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -43,6 +43,7 @@
     "sphinx.ext.autosummary",
     "myst_parser",
     "sphinxarg.ext",
+    "sphinx_design",
     "sphinx_togglebutton",
 ]
 myst_enable_extensions = [
diff --git a/docs/source/contributing/model/index.md b/docs/source/contributing/model/index.md
index a2d601c83cf47..245e13b795ec4 100644
--- a/docs/source/contributing/model/index.md
+++ b/docs/source/contributing/model/index.md
@@ -2,7 +2,7 @@
 
 # Adding a New Model
 
-This section provides more information on how to integrate a [HuggingFace Transformers](https://github.com/huggingface/transformers) model into vLLM.
+This section provides more information on how to integrate a [PyTorch](https://pytorch.org/) model into vLLM.
 
 ```{toctree}
 :caption: Contents
diff --git a/docs/source/contributing/model/multimodal.md b/docs/source/contributing/model/multimodal.md
index e5dcd1223b361..76ab73e43d24b 100644
--- a/docs/source/contributing/model/multimodal.md
+++ b/docs/source/contributing/model/multimodal.md
@@ -1,6 +1,6 @@
-(enabling-multimodal-inputs)=
+(supports-multimodal)=
 
-# Enabling Multimodal Inputs
+# Multi-Modal Support
 
 This document walks you through the steps to extend a basic model so that it accepts [multi-modal inputs](#multimodal-inputs).
 
@@ -37,103 +37,355 @@ Further update the model as follows:
     ) -> SamplerOutput:
   ```
 
-## 2. Register input mappers
+## 2. Specify processing information
 
-For each modality type that the model accepts as input, decorate the model class with {meth}`MULTIMODAL_REGISTRY.register_input_mapper <vllm.multimodal.MultiModalRegistry.register_input_mapper>`.
-This decorator accepts a function that maps multi-modal inputs to the keyword arguments you have previously defined in {meth}`~torch.nn.Module.forward`.
+Next, create a subclass of {class}`~vllm.multimodal.processing.BaseProcessingInfo`
+to provide basic information related to HF processing.
 
-```diff
-  from vllm.model_executor.models.interfaces import SupportsMultiModal
-+ from vllm.multimodal import MULTIMODAL_REGISTRY
+### Maximum number of input items
 
-+ @MULTIMODAL_REGISTRY.register_image_input_mapper()
-  class YourModelForImage2Seq(nn.Module, SupportsMultiModal):
+You need to override the abstract method {meth}`~vllm.multimodal.processing.BaseProcessingInfo.get_supported_mm_limits`
+to return the maximum number of input items for each modality supported by the model.
+
+For example, if the model supports any number of images but only one video per prompt:
+
+```python
+def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    return {"image": None, "video": 1}
 ```
 
-A default mapper is available for each modality in the core vLLM library. This input mapper will be used if you do not provide your own function.
+### Maximum number of placeholder feature tokens
+
+Also, override the abstract method {meth}`~vllm.multimodal.processing.BaseProcessingInfo.get_mm_max_tokens_per_item`
+to return the maximum number of placeholder feature tokens per input item for each modality.
+
+When calling the model, the output embeddings from the visual encoder are assigned to the input positions
+containing placeholder feature tokens. Therefore, the number of placeholder feature tokens should be equal
+to the size of the output embeddings.
+
+::::{tab-set}
+:::{tab-item} Basic example: LLaVA
+:sync: llava
+
+Looking at the code of HF's `LlavaForConditionalGeneration`:
+
+```python
+# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/llava/modeling_llava.py#L530-L544
+n_image_tokens = (input_ids == self.config.image_token_index).sum().item()
+n_image_features = image_features.shape[0] * image_features.shape[1]
+
+if n_image_tokens != n_image_features:
+    raise ValueError(
+        f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
+    )
+special_image_mask = (
+    (input_ids == self.config.image_token_index)
+    .unsqueeze(-1)
+    .expand_as(inputs_embeds)
+    .to(inputs_embeds.device)
+)
+image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
+inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
+```
 
-```{seealso}
-[Input Processing Pipeline](#input-processing-pipeline)
+The number of placeholder feature tokens per image is `image_features.shape[1]`.
+`image_features` is calculated inside the `get_image_features` method:
+
+```python
+# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/llava/modeling_llava.py#L290-L300
+image_outputs = self.vision_tower(pixel_values, output_hidden_states=True)
+
+selected_image_feature = image_outputs.hidden_states[vision_feature_layer]
+if vision_feature_select_strategy == "default":
+    selected_image_feature = selected_image_feature[:, 1:]
+elif vision_feature_select_strategy == "full":
+    selected_image_feature = selected_image_feature
+else:
+    raise ValueError(f"Unexpected select feature strategy: {self.config.vision_feature_select_strategy}")
+image_features = self.multi_modal_projector(selected_image_feature)
+return image_features
 ```
 
-## 3. Register maximum number of multi-modal tokens
+We can infer that `image_features.shape[1]` is based on `image_outputs.hidden_states.shape[1]` from the vision tower
+(`CLIPVisionModel` for the [`llava-hf/llava-1.5-7b-hf`](https://huggingface.co/llava-hf/llava-1.5-7b-hf) model).
+Moreover, we only need the sequence length (the second dimension of the tensor) to get `image_features.shape[1]`.
+The sequence length is determined by the initial hidden states in `CLIPVisionTransformer` since the attention
+mechanism doesn't change the sequence length of the output hidden states.
+
+```python
+# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/clip/modeling_clip.py#L1094-L1102
+hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)
+hidden_states = self.pre_layrnorm(hidden_states)
+
+encoder_outputs = self.encoder(
+    inputs_embeds=hidden_states,
+    output_attentions=output_attentions,
+    output_hidden_states=output_hidden_states,
+    return_dict=return_dict,
+)
+```
 
-For each modality type that the model accepts as input, calculate the maximum possible number of tokens per data item
-and register it via {meth}`INPUT_REGISTRY.register_dummy_data <vllm.inputs.registry.InputRegistry.register_max_multimodal_tokens>`.
+To find the sequence length, we turn to the code of `CLIPVisionEmbeddings`:
+
+```python
+# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/clip/modeling_clip.py#L247-L257
+target_dtype = self.patch_embedding.weight.dtype
+patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))  # shape = [*, width, grid, grid]
+patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
+
+class_embeds = self.class_embedding.expand(batch_size, 1, -1)
+embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
+if interpolate_pos_encoding:
+    embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
+else:
+    embeddings = embeddings + self.position_embedding(self.position_ids)
+return embeddings
+```
 
-```diff
-  from vllm.inputs import INPUT_REGISTRY
-  from vllm.model_executor.models.interfaces import SupportsMultiModal
-  from vllm.multimodal import MULTIMODAL_REGISTRY
+We can infer that `embeddings.shape[1] == self.num_positions`, where
 
-  @MULTIMODAL_REGISTRY.register_image_input_mapper()
-+ @MULTIMODAL_REGISTRY.register_max_image_tokens(<your_calculation>)
-  @INPUT_REGISTRY.register_dummy_data(<your_dummy_data_factory>)
-  class YourModelForImage2Seq(nn.Module, SupportsMultiModal):
+```python
+# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/clip/modeling_clip.py#L195-L196
+self.num_patches = (self.image_size // self.patch_size) ** 2
+self.num_positions = self.num_patches + 1
 ```
 
-Here are some examples:
+Overall, the number of placeholder feature tokens for an image can be calculated as:
 
-- Image inputs (static feature size): [LLaVA-1.5 Model](gh-file:vllm/model_executor/models/llava.py)
-- Image inputs (dynamic feature size): [LLaVA-NeXT Model](gh-file:vllm/model_executor/models/llava_next.py)
+```python
+def get_num_image_tokens(
+    self,
+    *,
+    image_width: int,
+    image_height: int,
+) -> int:
+    hf_config = self.get_hf_config()
+    hf_processor = self.get_hf_processor()
 
-```{seealso}
-[Input Processing Pipeline](#input-processing-pipeline)
+    image_size = hf_config.vision_config.image_size
+    patch_size = hf_config.vision_config.patch_size
+
+    num_image_tokens = (image_size // patch_size) ** 2 + 1
+    if hf_processor.vision_feature_select_strategy == "default":
+        num_image_tokens -= 1
+
+    return num_image_tokens
 ```
 
-## 4. (Optional) Register dummy data
+Notice that the number of image tokens doesn't depend on the image width and height.
+So, we can calculate the maximum number of image tokens using any image size:
 
-During startup, dummy data is passed to the vLLM model to allocate memory. This only consists of text input by default, which may not be applicable to multi-modal models.
-In such cases, you can define your own dummy data by registering a factory method via {meth}`INPUT_REGISTRY.register_dummy_data <vllm.inputs.registry.InputRegistry.register_dummy_data>`.
+```python
+def get_image_size_with_most_features(self) -> ImageSize:
+    hf_config = self.get_hf_config()
+    width = height = hf_config.image_size
+    return ImageSize(width=width, height=height)
 
-```diff
-  from vllm.inputs import INPUT_REGISTRY
-  from vllm.model_executor.models.interfaces import SupportsMultiModal
-  from vllm.multimodal import MULTIMODAL_REGISTRY
+def get_max_image_tokens(self) -> int:
+    target_width, target_height = self.get_image_size_with_most_features()
 
-  @MULTIMODAL_REGISTRY.register_image_input_mapper()
-  @MULTIMODAL_REGISTRY.register_max_image_tokens(<your_calculation>)
-+ @INPUT_REGISTRY.register_dummy_data(<your_dummy_data_factory>)
-  class YourModelForImage2Seq(nn.Module, SupportsMultiModal):
+    return self.get_num_image_tokens(
+        image_width=target_width,
+        image_height=target_height,
+    )
+```
+
+And thus, we can override the method as:
+
+```python
+def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
+    return {"image": self.get_max_image_tokens()}
 ```
 
 ```{note}
-The dummy data should have the maximum possible number of multi-modal tokens, as described in the previous step.
+Our [actual code](gh-file:vllm/model_executor/models/llava.py) is more abstracted to support vision encoders other than CLIP.
 ```
+:::
+::::
+
+## 3. Specify dummy inputs
+
+Then, inherit {class}`~vllm.multimodal.profiling.BaseDummyInputsBuilder` to construct dummy inputs for
+HF processing as well as memory profiling.
+
+### For memory profiling
+
+Override the abstract method {meth}`~vllm.multimodal.profiling.BaseDummyInputsBuilder.get_dummy_processor_inputs`
+to construct dummy inputs for memory profiling. This dummy input should result in the worst-case memory usage of
+the model so that vLLM can reserve the correct amount of memory for it.
+
+Assuming that the memory usage increases with the number of tokens, the dummy input can be constructed based
+on the code for {meth}`~vllm.multimodal.processing.BaseProcessingInfo.get_mm_max_tokens_per_item`.
+
+::::{tab-set}
+:::{tab-item} Basic example: LLaVA
+:sync: llava
+Making use of the `get_image_size_with_most_features` method implemented in the previous section:
+
+```python
+def get_dummy_processor_inputs(
+    self,
+    seq_len: int,
+    mm_counts: Mapping[str, int],
+) -> ProcessorInputs:
+    num_images = mm_counts.get("image", 0)
+
+    processor = self.info.get_hf_processor()
+    image_token = processor.image_token
+  
+    hf_config = self.get_hf_config()
+    target_width, target_height = self.info.get_image_size_with_most_features()
+
+    mm_data = {
+        "image":
+        self._get_dummy_images(width=target_width,
+                               height=target_height,
+                               num_images=num_images)
+    }
+
+    return ProcessorInputs(
+        prompt_text=image_token * num_images,
+        mm_data=mm_data,
+    )
+```
+:::
+::::
 
-Here are some examples:
+## 4. Specify processing details
 
-- Image inputs (static feature size): [LLaVA-1.5 Model](gh-file:vllm/model_executor/models/llava.py)
-- Image inputs (dynamic feature size): [LLaVA-NeXT Model](gh-file:vllm/model_executor/models/llava_next.py)
+Afterwards, create a subclass of {class}`~vllm.multimodal.processing.BaseMultiModalProcessor`
+to fill in the missing details about HF processing.
 
 ```{seealso}
-[Input Processing Pipeline](#input-processing-pipeline)
+[Multi-Modal Data Processing](#mm-processing)
 ```
 
-## 5. (Optional) Register input processor
+### Multi-modal fields
+
+Override {meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._get_mm_fields_config` to
+return a schema of the tensors outputted by the HF processor that are related to the input multi-modal items.
+
+::::{tab-set}
+:::{tab-item} Basic example: LLaVA
+:sync: llava
+
+Looking at the model's `forward` method:
+
+```python
+# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/llava/modeling_llava.py#L387-L404
+def forward(
+    self,
+    input_ids: torch.LongTensor = None,
+    pixel_values: torch.FloatTensor = None,
+    attention_mask: Optional[torch.Tensor] = None,
+    position_ids: Optional[torch.LongTensor] = None,
+    past_key_values: Optional[List[torch.FloatTensor]] = None,
+    inputs_embeds: Optional[torch.FloatTensor] = None,
+    vision_feature_layer: Optional[int] = None,
+    vision_feature_select_strategy: Optional[str] = None,
+    labels: Optional[torch.LongTensor] = None,
+    use_cache: Optional[bool] = None,
+    output_attentions: Optional[bool] = None,
+    output_hidden_states: Optional[bool] = None,
+    return_dict: Optional[bool] = None,
+    cache_position: Optional[torch.LongTensor] = None,
+    num_logits_to_keep: int = 0,
+) -> Union[Tuple, LlavaCausalLMOutputWithPast]:
+```
 
-Sometimes, there is a need to process inputs at the {class}`~vllm.LLMEngine` level before they are passed to the model executor.
-This is often due to the fact that unlike implementations in HuggingFace Transformers, the reshaping and/or expansion of multi-modal embeddings needs to take place outside model's {meth}`~torch.nn.Module.forward` call.
-You can register input processors via {meth}`INPUT_REGISTRY.register_input_processor <vllm.inputs.registry.InputRegistry.register_input_processor>`.
+The only related keyword argument is `pixel_values` which directly corresponds to input images.
+The shape of `pixel_values` is `(N, C, H, W)` where `N` is the number of images.
+So, we override the method as follows:
+
+```python
+def _get_mm_fields_config(
+    self,
+    hf_inputs: BatchFeature,
+    hf_processor_mm_kwargs: Mapping[str, object],
+) -> Mapping[str, MultiModalFieldConfig]:
+    return dict(
+        pixel_values=MultiModalFieldConfig.batched("image"),
+    )
+```
 
-```diff
-  from vllm.inputs import INPUT_REGISTRY
-  from vllm.model_executor.models.interfaces import SupportsMultiModal
-  from vllm.multimodal import MULTIMODAL_REGISTRY
+```{note}
+Our [actual code](gh-file:vllm/model_executor/models/llava.py) additionally supports
+pre-computed image embeddings, which can be passed to the model via the `image_embeds` argument.
+```
+:::
+::::
 
-  @MULTIMODAL_REGISTRY.register_image_input_mapper()
-  @MULTIMODAL_REGISTRY.register_max_image_tokens(<your_calculation>)
-  @INPUT_REGISTRY.register_dummy_data(<your_dummy_data_factory>)
-+ @INPUT_REGISTRY.register_input_processor(<your_input_processor>)
-  class YourModelForImage2Seq(nn.Module, SupportsMultiModal):
+### Prompt replacements
+
+Override {meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_replacements` to
+return a list of {class}`~vllm.multimodal.processing.PromptReplacement` instances.
+
+Each {class}`~vllm.multimodal.processing.PromptReplacement` instance specifies a find-and-replace
+operation performed by the HF processor.
+
+::::{tab-set}
+:::{tab-item} Basic example: LLaVA
+:sync: llava
+
+Looking at HF's `LlavaProcessor`:
+
+```python
+# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/llava/processing_llava.py#L167-L170
+prompt_strings = []
+for sample in text:
+    sample = sample.replace(self.image_token, self.image_token * num_image_tokens)
+    prompt_strings.append(sample)
 ```
 
-A common use case of input processors is inserting placeholder tokens to leverage the vLLM framework for attention mask generation.
-Here are some examples:
+It simply repeats each input `image_token` a number of times equal to the number of placeholder feature tokens (`num_image_tokens`).
+Based on this, we override the method as follows:
+
+```python
+def _get_prompt_replacements(
+    self,
+    mm_items: MultiModalDataItems,
+    hf_processor_mm_kwargs: Mapping[str, object],
+    out_mm_kwargs: MultiModalKwargs,
+) -> list[PromptReplacement]:
+    hf_config = self.info.get_hf_config()
+    image_token_id = hf_config.image_token_index
+
+    def get_replacement(item_idx: int):
+        images = mm_items.get_items("image", ImageProcessorItems)
+
+        image_size = images.get_image_size(item_idx)
+        num_image_tokens = self.info.get_num_image_tokens(
+            image_width=image_size.width,
+            image_height=image_size.height,
+        )
+
+        return [image_token_id] * num_image_tokens
+
+    return [
+        PromptReplacement(
+            modality="image",
+            target=[image_token_id],
+            replacement=get_replacement,
+        ),
+    ]
+```
+:::
+::::
 
-- Insert static number of image tokens: [LLaVA-1.5 Model](gh-file:vllm/model_executor/models/llava.py)
-- Insert dynamic number of image tokens: [LLaVA-NeXT Model](gh-file:vllm/model_executor/models/llava_next.py)
+## 5. Register processor-related classes
 
-```{seealso}
-[Input Processing Pipeline](#input-processing-pipeline)
+After you have defined {class}`~vllm.multimodal.processing.BaseProcessingInfo` (Step 2),
+{class}`~vllm.multimodal.profiling.BaseDummyInputsBuilder` (Step 3),
+and {class}`~vllm.multimodal.processing.BaseMultiModalProcessor` (Step 4),
+decorate the model class with {meth}`MULTIMODAL_REGISTRY.register_processor <vllm.multimodal.registry.MultiModalRegistry.register_processor>`
+to register them to the multi-modal registry:
+
+```diff
+  from vllm.model_executor.models.interfaces import SupportsMultiModal
++ from vllm.multimodal import MULTIMODAL_REGISTRY
+
++ @MULTIMODAL_REGISTRY.register_processor(YourMultiModalProcessor,
++                                         info=YourProcessingInfo,
++                                         dummy_inputs=YourDummyInputsBuilder)
+  class YourModelForImage2Seq(nn.Module, SupportsMultiModal):
 ```
diff --git a/docs/source/contributing/model/registration.md b/docs/source/contributing/model/registration.md
index fe5aa94c52896..6a9262669cd29 100644
--- a/docs/source/contributing/model/registration.md
+++ b/docs/source/contributing/model/registration.md
@@ -48,7 +48,7 @@ ModelRegistry.register_model("YourModelForCausalLM", "your_code:YourModelForCaus
 
 ```{important}
 If your model is a multimodal model, ensure the model class implements the {class}`~vllm.model_executor.models.interfaces.SupportsMultiModal` interface.
-Read more about that [here](#enabling-multimodal-inputs).
+Read more about that [here](#supports-multimodal).
 ```
 
 ```{note}
diff --git a/docs/source/design/input_processing/input_processing_pipeline.md b/docs/source/design/input_processing/input_processing_pipeline.md
deleted file mode 100644
index bb16920e3d0c0..0000000000000
--- a/docs/source/design/input_processing/input_processing_pipeline.md
+++ /dev/null
@@ -1,19 +0,0 @@
-(input-processing-pipeline)=
-
-# Input Processing Pipeline
-
-1. Input data is passed to {class}`~vllm.LLMEngine` (or {class}`~vllm.AsyncLLMEngine`).
-
-2. Tokenize the data if necessary.
-
-3. Process the inputs using {meth}`INPUT_REGISTRY.process_input <vllm.inputs.registry.InputRegistry.process_input>`.
-
-   - For example, add placeholder tokens to reserve KV cache for multi-modal embeddings.
-
-4. Send the processed inputs to {class}`~vllm.executor.executor_base.ExecutorBase`.
-
-5. Distribute the inputs via {class}`~vllm.worker.worker_base.WorkerBase` to {class}`~vllm.worker.model_runner_base.ModelRunnerBase`.
-
-6. If the data contains multi-modal data, convert it into keyword arguments using {meth}`MULTIMODAL_REGISTRY.map_input <vllm.multimodal.MultiModalRegistry.map_input>`.
-
-   - For example, convert a {class}`PIL.Image.Image` input to its pixel values for a vision model.
diff --git a/docs/source/design/input_processing/model_inputs_index.md b/docs/source/design/input_processing/model_inputs_index.md
deleted file mode 100644
index cb415366e5a66..0000000000000
--- a/docs/source/design/input_processing/model_inputs_index.md
+++ /dev/null
@@ -1,43 +0,0 @@
-(input-processing)=
-
-# Input Processing
-
-```{eval-rst}
-.. currentmodule:: vllm.inputs
-```
-
-Each model can override parts of vLLM's [input processing pipeline](#input-processing-pipeline) via
-{data}`~vllm.inputs.INPUT_REGISTRY` and {data}`~vllm.multimodal.MULTIMODAL_REGISTRY`.
-
-Currently, this mechanism is only utilized in [multi-modal](#multi-modality) models for preprocessing multi-modal input
-data in addition to input prompt, but it can be extended to text-only language models when needed.
-
-## Guides
-
-```{toctree}
-:maxdepth: 1
-
-input_processing_pipeline
-```
-
-## Module Contents
-
-### LLM Engine Inputs
-
-```{eval-rst}
-.. autoclass:: vllm.inputs.DecoderOnlyInputs
-    :members:
-    :show-inheritance:
-```
-
-### Registry
-
-```{eval-rst}
-.. autodata:: vllm.inputs.INPUT_REGISTRY
-```
-
-```{eval-rst}
-.. automodule:: vllm.inputs.registry
-    :members:
-    :show-inheritance:
-```
diff --git a/docs/source/design/mm_processing.md b/docs/source/design/mm_processing.md
new file mode 100644
index 0000000000000..a0d01205e638c
--- /dev/null
+++ b/docs/source/design/mm_processing.md
@@ -0,0 +1,64 @@
+(mm-processing)=
+
+# Multi-Modal Data Processing
+
+To enable various optimizations in vLLM such as [chunked prefill](#chunked-prefill) and [prefix caching](#automatic-prefix-caching), we use {class}`~vllm.multimodal.processing.BaseMultiModalProcessor` to provide the correspondence between placeholder feature tokens (e.g. `<image>`) and multi-modal inputs (e.g. the raw input image) based on the outputs of HF processor.
+
+Here are the main features of {class}`~vllm.multimodal.processing.BaseMultiModalProcessor`:
+
+## Prompt Replacement Detection
+
+One of the main responsibilities of the HF processor is to replace input placeholder tokens (e.g. `<image>` for a single image) with feature placeholder tokens (e.g. `<image><image>...<image>`, the number of which equals the feature size). The information about which tokens have been replaced is key to finding the correspondence between placeholder feature tokens and multi-modal inputs.
+
+In vLLM, this information is specified using {class}`~vllm.multimodal.processing.PromptReplacement` in {meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_replacements`. Given this specification, we can automatically detect whether HF has replaced the input placeholder tokens by checking whether the feature placeholder tokens exist in the prompt.
+
+## Tokenized Prompt Inputs
+
+To enable tokenization in a separate process, we support passing input token IDs alongside multi-modal data.
+
+### The problem
+
+Consider that HF processors follow these main steps:
+
+1. Tokenize the text
+2. Process multi-modal inputs
+3. Perform prompt replacement
+
+And we require that:
+
+- For text + multi-modal inputs, apply all steps 1--3.
+- For tokenized + multi-modal inputs, apply only steps 2--3.
+
+How can we achieve this without rewriting HF processors? We can try to call the HF processor several times on different inputs:
+
+- For text + multi-modal inputs, simply call the HF processor directly.
+- For tokenized + multi-modal inputs, call the processor only on the multi-modal inputs.
+
+While HF processors support text + multi-modal inputs natively, this is not so for tokenized + multi-modal inputs: an error is thrown if the number of input placeholder tokens does not correspond to the number of multi-modal inputs.
+
+Moreover, since the tokenized text has not passed through the HF processor, we have to apply Step 3 by ourselves to keep the output tokens and multi-modal data consistent with each other.
+
+(mm-dummy-text)=
+
+### Dummy text
+
+We work around the first issue by requiring each model to define how to generate dummy text based on the number of multi-modal inputs, via {meth}`~vllm.multimodal.profiling.BaseDummyInputsBuilder.get_dummy_processor_inputs`. This lets us generate dummy text corresponding to the multi-modal inputs and input them together to obtain the processed multi-modal data.
+
+(mm-automatic-prompt-replacement)=
+
+### Automatic prompt replacement
+
+We address the second issue by implementing model-agnostic code in
+{meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._apply_prompt_replacements` to automatically replace input placeholder tokens with feature placeholder tokens based on the specification outputted by {meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_replacements`.
+
+### Summary
+
+With the help of dummy text and automatic prompt replacement, our multi-modal processor can finally accept both text and token prompts with multi-modal data. The detailed logic is shown in {meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._apply_hf_processor_main`.
+
+## Processor Output Caching
+
+Some HF processors, such as the one for Qwen2-VL, are [very slow](gh-issue:9238). To alleviate this problem, we cache the multi-modal outputs of HF processor to avoid processing the same multi-modal input (e.g. image) again.
+
+When new data is passed in, we first check which items are in the cache, and which ones are missing. The missing items are passed into the HF processor in a single batch and cached, before being merged with the existing items in the cache.
+
+Since we only process the missing multi-modal data items, the number of input placeholder tokens no longer corresponds to the number of the multi-modal inputs, so they can't be passed alongside the text prompt to HF processor. Therefore, we process the text and multi-modal inputs separately, using [dummy text](#mm-dummy-text) to avoid HF errors. Since this skips HF's prompt replacement code, we apply [automatic prompt replacement](#mm-automatic-prompt-replacement) afterwards to keep the output tokens and multi-modal data consistent with each other.
diff --git a/docs/source/index.md b/docs/source/index.md
index 356fa4b7fd573..de74276a50fb6 100644
--- a/docs/source/index.md
+++ b/docs/source/index.md
@@ -145,7 +145,7 @@ design/arch_overview
 design/huggingface_integration
 design/plugin_system
 design/kernel/paged_attention
-design/input_processing/model_inputs_index
+design/mm_processing
 design/automatic_prefix_caching
 design/multiprocessing
 ```
diff --git a/docs/source/serving/multimodal_inputs.md b/docs/source/serving/multimodal_inputs.md
index 7e96ed46f2dcc..a06f121a6899a 100644
--- a/docs/source/serving/multimodal_inputs.md
+++ b/docs/source/serving/multimodal_inputs.md
@@ -14,7 +14,7 @@ and [open an issue on GitHub](https://github.com/vllm-project/vllm/issues/new/ch
 To input multi-modal data, follow this schema in {class}`vllm.inputs.PromptType`:
 
 - `prompt`: The prompt should follow the format that is documented on HuggingFace.
-- `multi_modal_data`: This is a dictionary that follows the schema defined in {class}`vllm.multimodal.MultiModalDataDict`.
+- `multi_modal_data`: This is a dictionary that follows the schema defined in {class}`vllm.multimodal.inputs.MultiModalDataDict`.
 
 ### Image
 
diff --git a/vllm/config.py b/vllm/config.py
index 13b5390008a35..59b509d5a961e 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -2124,8 +2124,7 @@ class MultiModalConfig:
 
     limit_per_prompt: Mapping[str, int] = field(default_factory=dict)
     """
-    The maximum number of multi-modal input instances allowed per prompt
-    for each :class:`~vllm.multimodal.MultiModalPlugin`.
+    The maximum number of input items allowed per prompt for each modality.
     """
 
     def compute_hash(self) -> str:
diff --git a/vllm/inputs/__init__.py b/vllm/inputs/__init__.py
index aaeecab7ffde1..a0dd89f69bacd 100644
--- a/vllm/inputs/__init__.py
+++ b/vllm/inputs/__init__.py
@@ -11,9 +11,6 @@
 """
 The global :class:`~InputRegistry` which is used by :class:`~vllm.LLMEngine`
 to dispatch data processing according to the target model.
-
-See also:
-    :ref:`input-processing-pipeline`
 """
 
 __all__ = [
diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py
index aad0dfab94a01..4b73ade7af5f0 100644
--- a/vllm/inputs/registry.py
+++ b/vllm/inputs/registry.py
@@ -313,9 +313,6 @@ def dummy_data_for_profiling(
 
         The model is identified by ``model_config``.
 
-        See also:
-            :ref:`enabling-multimodal-inputs`
-
         Note:
             This should be called after
             :meth:`~MultiModalRegistry.init_mm_limits_per_prompt`.
@@ -384,10 +381,8 @@ def register_input_processor(self, processor: InputProcessor):
         Register an input processor to a model class.
 
         The provided function is invoked on each input to the model. This
-        happens before :meth:`~vllm.multimodal.MultiModalRegistry.map_input`.
-
-        See also:
-            :ref:`input-processing-pipeline`
+        happens before
+        :meth:`~vllm.multimodal.registry.MultiModalRegistry.map_input`.
         """
 
         def wrapper(model_cls: N) -> N:
@@ -429,9 +424,6 @@ def process_input(self, model_config: "ModelConfig",
         Apply an input processor to an instance of model inputs.
 
         The model is identified by ``model_config``.
-
-        See also:
-            :ref:`input-processing-pipeline`
         """
         # Avoid circular import
         from vllm.model_executor.model_loader import get_model_architecture
diff --git a/vllm/multimodal/__init__.py b/vllm/multimodal/__init__.py
index 343b9322ecc5e..1d7f5d57fa24e 100644
--- a/vllm/multimodal/__init__.py
+++ b/vllm/multimodal/__init__.py
@@ -8,10 +8,10 @@
 MULTIMODAL_REGISTRY = MultiModalRegistry()
 """
 The global :class:`~MultiModalRegistry` is used by model runners to
-dispatch data processing according to its modality and the target model.
+dispatch data processing according to the target model.
 
 See also:
-    :ref:`input-processing-pipeline`
+    :ref:`mm-processing`
 """
 
 __all__ = [
diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py
index 4941fbac963ca..fd3ec7e0ec8ce 100644
--- a/vllm/multimodal/base.py
+++ b/vllm/multimodal/base.py
@@ -90,10 +90,6 @@ def register_input_mapper(
         invoked to transform the data into a dictionary of model inputs.
 
         If `None` is provided, then the default input mapper is used instead.
-
-        See also:
-            - :ref:`input-processing-pipeline`
-            - :ref:`enabling-multimodal-inputs`
         """
 
         def wrapper(model_cls: N) -> N:
@@ -126,10 +122,6 @@ def map_input(
 
         Raises:
             TypeError: If the data type is not supported.
-
-        See also:
-            - :ref:`input-processing-pipeline`
-            - :ref:`enabling-multimodal-inputs`
         """
 
         # Avoid circular import
@@ -186,9 +178,6 @@ def register_max_multimodal_tokens(
         for a model class.
 
         If `None` is provided, then the default calculation is used instead.
-
-        See also:
-            :ref:`enabling-multimodal-inputs`
         """
 
         def wrapper(model_cls: N) -> N:
@@ -218,9 +207,6 @@ def get_max_multimodal_tokens(self, model_config: "ModelConfig") -> int:
         If this registry is not applicable to the model, `0` is returned.
 
         The model is identified by ``model_config``.
-
-        See also:
-            :ref:`enabling-multimodal-inputs`
         """
         # Avoid circular import
         from vllm.model_executor.model_loader import get_model_architecture
diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py
index 8680e4175593b..4b63703585214 100644
--- a/vllm/multimodal/inputs.py
+++ b/vllm/multimodal/inputs.py
@@ -493,7 +493,8 @@ def get_items(self, modality: str) -> Sequence[MultiModalKwargsItem]:
 
 class MultiModalInputsV2(TypedDict):
     """
-    Represents the outputs of :class:`vllm.multimodal.MultiModalProcessor`,
+    Represents the outputs of
+    :class:`vllm.multimodal.processing.BaseMultiModalProcessor`,
     ready to be passed to vLLM internals.
     """
 
diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py
index 9eceefb08c93f..804a91da8c889 100644
--- a/vllm/multimodal/registry.py
+++ b/vllm/multimodal/registry.py
@@ -100,8 +100,7 @@ def __getitem__(self, key: "ModelConfig") -> Dict[str, int]:
 
 class MultiModalRegistry:
     """
-    A registry that dispatches data processing to the
-    :class:`~vllm.multimodal.MultiModalPlugin` for each modality.
+    A registry that dispatches data processing according to the model.
     """
 
     DEFAULT_PLUGINS = (ImagePlugin(), AudioPlugin(), VideoPlugin())
@@ -367,8 +366,7 @@ def register_processor(
         invoked to transform the data into a dictionary of model inputs.
 
         See also:
-            - :ref:`input-processing-pipeline`
-            - :ref:`enabling-multimodal-inputs`
+            :ref:`mm-processing`
         """
 
         def wrapper(model_cls: N) -> N:
@@ -398,6 +396,9 @@ def _get_model_cls(self, model_config: "ModelConfig"):
     def has_processor(self, model_config: "ModelConfig") -> bool:
         """
         Test whether a multi-modal processor is defined for a specific model.
+
+        See also:
+            :ref:`mm-processing`
         """
         return self._get_model_cls(model_config) in self._processor_factories
 
@@ -408,6 +409,9 @@ def create_processor(
     ) -> BaseMultiModalProcessor[BaseProcessingInfo]:
         """
         Create a multi-modal processor for a specific model and tokenizer.
+
+        See also:
+            :ref:`mm-processing`
         """
         model_cls = self._get_model_cls(model_config)
         factories = self._processor_factories[model_cls]

From 20410b2fdac1818ead453018fb07c2ff90ee6770 Mon Sep 17 00:00:00 2001
From: wangxiyuan <wangxiyuan1007@gmail.com>
Date: Fri, 10 Jan 2025 23:46:51 +0800
Subject: [PATCH 074/115] [platform] support custom torch.compile backend key
 (#11318)

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
Signed-off-by: youkaichao <youkaichao@gmail.com>
Co-authored-by: youkaichao <youkaichao@gmail.com>
---
 vllm/model_executor/layers/rejection_sampler.py        | 3 ++-
 vllm/model_executor/layers/vocab_parallel_embedding.py | 2 +-
 vllm/model_executor/models/commandr.py                 | 3 ++-
 vllm/model_executor/models/phi3_small.py               | 5 +++--
 vllm/platforms/interface.py                            | 6 ++++++
 5 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/vllm/model_executor/layers/rejection_sampler.py b/vllm/model_executor/layers/rejection_sampler.py
index f173cbde03f44..9d6c3797c62fc 100644
--- a/vllm/model_executor/layers/rejection_sampler.py
+++ b/vllm/model_executor/layers/rejection_sampler.py
@@ -9,6 +9,7 @@
 from vllm.logger import init_logger
 from vllm.model_executor.layers.spec_decode_base_sampler import (
     SpecDecodeStochasticBaseSampler)
+from vllm.platforms import current_platform
 
 logger = init_logger(__name__)
 
@@ -368,7 +369,7 @@ def _smallest_positive_value(self) -> float:
 # Note that we always sample with replacement.
 # probs will be modified in place, but this is fine, as we pass
 # in a copy already.
-@torch.compile(dynamic=True)
+@torch.compile(dynamic=True, backend=current_platform.simple_compile_backend)
 def _multinomial(
     probs: torch.Tensor,
     num_samples: int,
diff --git a/vllm/model_executor/layers/vocab_parallel_embedding.py b/vllm/model_executor/layers/vocab_parallel_embedding.py
index 30548e656c557..65920aa61ba15 100644
--- a/vllm/model_executor/layers/vocab_parallel_embedding.py
+++ b/vllm/model_executor/layers/vocab_parallel_embedding.py
@@ -133,7 +133,7 @@ def __post_init__(self):
         assert self.num_added_elements <= self.num_added_elements_padded
 
 
-@torch.compile(dynamic=True)
+@torch.compile(dynamic=True, backend=current_platform.simple_compile_backend)
 def get_masked_input_and_mask(
         input_: torch.Tensor, org_vocab_start_index: int,
         org_vocab_end_index: int, num_org_vocab_padding: int,
diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py
index d22d1f3171463..8d61ece289412 100644
--- a/vllm/model_executor/models/commandr.py
+++ b/vllm/model_executor/models/commandr.py
@@ -45,6 +45,7 @@
     row_parallel_weight_loader)
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.model_executor.utils import set_weight_attrs
+from vllm.platforms import current_platform
 from vllm.sequence import IntermediateTensors
 
 from .interfaces import SupportsLoRA, SupportsPP
@@ -53,7 +54,7 @@
                     maybe_prefix)
 
 
-@torch.compile
+@torch.compile(backend=current_platform.simple_compile_backend)
 def layer_norm_func(hidden_states, weight, variance_epsilon):
     input_dtype = hidden_states.dtype
     hidden_states = hidden_states.to(torch.float32)
diff --git a/vllm/model_executor/models/phi3_small.py b/vllm/model_executor/models/phi3_small.py
index da7e4cdbc6940..f47676b934e4e 100644
--- a/vllm/model_executor/models/phi3_small.py
+++ b/vllm/model_executor/models/phi3_small.py
@@ -20,6 +20,7 @@
     DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.sampling_metadata import SamplingMetadata
+from vllm.platforms import current_platform
 from vllm.sequence import IntermediateTensors
 
 from .interfaces import SupportsPP
@@ -54,12 +55,12 @@ def weight_loader(self, param: torch.nn.Parameter,
         return load_column_parallel_weight(param, loaded_weight)
 
 
-@torch.compile(dynamic=True)
+@torch.compile(dynamic=True, backend=current_platform.simple_compile_backend)
 def quick_gelu(x):
     return x * torch.sigmoid(1.702 * x)
 
 
-@torch.compile(dynamic=True)
+@torch.compile(dynamic=True, backend=current_platform.simple_compile_backend)
 def gegelu(input, limit: Optional[float] = None):
     a_gelu, a_linear = input[..., ::2], input[..., 1::2]
     if limit is not None:
diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py
index 01d753408e6d0..fe398801c5dd9 100644
--- a/vllm/platforms/interface.py
+++ b/vllm/platforms/interface.py
@@ -82,6 +82,12 @@ class Platform:
     # check https://github.com/pytorch/pytorch/blob/313dac6c1ca0fa0cde32477509cce32089f8532a/torchgen/model.py#L134 # noqa
     # use "CPU" as a fallback for platforms not registered in PyTorch
     dispatch_key: str = "CPU"
+    # The torch.compile backend for compiling simple and
+    # standalone functions. The default value is "inductor" to keep
+    # the same behavior as PyTorch.
+    # NOTE: for the forward part of the model, vLLM has another separate
+    # compilation strategy.
+    simple_compile_backend: str = "inductor"
     supported_quantization: list[str] = []
 
     def is_cuda(self) -> bool:

From 482cdc494e608b72303f49b56532f5c50b61cbdb Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Fri, 10 Jan 2025 15:50:29 +0000
Subject: [PATCH 075/115] [Doc] Rename offline inference examples (#11927)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 .buildkite/run-cpu-test.sh                    |  2 +-
 .buildkite/run-gh200-test.sh                  |  2 +-
 .buildkite/run-hpu-test.sh                    |  2 +-
 .buildkite/run-neuron-test.sh                 |  2 +-
 .buildkite/run-openvino-test.sh               |  2 +-
 .buildkite/run-tpu-test.sh                    |  2 +-
 .buildkite/run-xpu-test.sh                    |  4 ++--
 .buildkite/test-pipeline.yaml                 | 20 +++++++++----------
 .../contributing/profiling/profiling_index.md |  2 +-
 docs/source/features/structured_outputs.md    |  2 +-
 .../getting_started/installation/cpu-x86.md   |  4 ++--
 docs/source/getting_started/quickstart.md     |  2 +-
 docs/source/models/generative_models.md       |  4 ++--
 docs/source/models/pooling_models.md          |  6 +++---
 docs/source/serving/multimodal_inputs.md      |  8 ++++----
 ...{offline_inference_arctic.py => arctic.py} |  0
 ...ce_audio_language.py => audio_language.py} |  0
 .../{offline_inference.py => basic.py}        |  0
 ...y => basic_with_model_default_sampling.py} |  0
 .../{offline_inference_chat.py => chat.py}    |  0
 ..._chat_with_tools.py => chat_with_tools.py} |  0
 ...ce_classification.py => classification.py} |  0
 .../{offline_inference_cli.py => cli.py}      |  0
 ...nference_distributed.py => distributed.py} |  0
 ...ne_inference_embedding.py => embedding.py} |  0
 ..._encoder_decoder.py => encoder_decoder.py} |  0
 .../offline_inference/florence2_inference.py  |  2 +-
 ...ence_mlpspeculator.py => mlpspeculator.py} |  0
 ...{offline_inference_neuron.py => neuron.py} |  0
 ...ization.py => neuron_int8_quantization.py} |  0
 .../openai_batch.md}                          | 18 ++++++++---------
 .../openai_example_batch.jsonl                |  0
 ...ffline_inference_pixtral.py => pixtral.py} |  0
 ...rence_with_prefix.py => prefix_caching.py} |  0
 .../{offline_profile.py => profiling.py}      |  2 +-
 ...ffline_inference_scoring.py => scoring.py} |  0
 ...e_with_profiler.py => simple_profiling.py} |  0
 ...tured_outputs.py => structured_outputs.py} |  0
 .../{offline_inference_tpu.py => tpu.py}      |  0
 ..._vision_language.py => vision_language.py} |  0
 ...edding.py => vision_language_embedding.py} |  0
 ...mage.py => vision_language_multi_image.py} |  0
 ...ffline_inference_whisper.py => whisper.py} |  0
 tests/plugins_tests/test_platform_plugins.py  |  2 +-
 tools/profiler/print_layerwise_table.py       |  2 +-
 tools/profiler/visualize_layerwise_profile.py |  2 +-
 46 files changed, 46 insertions(+), 46 deletions(-)
 rename examples/offline_inference/{offline_inference_arctic.py => arctic.py} (100%)
 rename examples/offline_inference/{offline_inference_audio_language.py => audio_language.py} (100%)
 rename examples/offline_inference/{offline_inference.py => basic.py} (100%)
 rename examples/offline_inference/{offline_inference_with_default_generation_config.py => basic_with_model_default_sampling.py} (100%)
 rename examples/offline_inference/{offline_inference_chat.py => chat.py} (100%)
 rename examples/offline_inference/{offline_chat_with_tools.py => chat_with_tools.py} (100%)
 rename examples/offline_inference/{offline_inference_classification.py => classification.py} (100%)
 rename examples/offline_inference/{offline_inference_cli.py => cli.py} (100%)
 rename examples/offline_inference/{offline_inference_distributed.py => distributed.py} (100%)
 rename examples/offline_inference/{offline_inference_embedding.py => embedding.py} (100%)
 rename examples/offline_inference/{offline_inference_encoder_decoder.py => encoder_decoder.py} (100%)
 rename examples/offline_inference/{offline_inference_mlpspeculator.py => mlpspeculator.py} (100%)
 rename examples/offline_inference/{offline_inference_neuron.py => neuron.py} (100%)
 rename examples/offline_inference/{offline_inference_neuron_int8_quantization.py => neuron_int8_quantization.py} (100%)
 rename examples/offline_inference/{offline_inference_openai/offline_inference_openai.md => openai/openai_batch.md} (92%)
 rename examples/offline_inference/{offline_inference_openai => openai}/openai_example_batch.jsonl (100%)
 rename examples/offline_inference/{offline_inference_pixtral.py => pixtral.py} (100%)
 rename examples/offline_inference/{offline_inference_with_prefix.py => prefix_caching.py} (100%)
 rename examples/offline_inference/{offline_profile.py => profiling.py} (99%)
 rename examples/offline_inference/{offline_inference_scoring.py => scoring.py} (100%)
 rename examples/offline_inference/{offline_inference_with_profiler.py => simple_profiling.py} (100%)
 rename examples/offline_inference/{offline_inference_structured_outputs.py => structured_outputs.py} (100%)
 rename examples/offline_inference/{offline_inference_tpu.py => tpu.py} (100%)
 rename examples/offline_inference/{offline_inference_vision_language.py => vision_language.py} (100%)
 rename examples/offline_inference/{offline_inference_vision_language_embedding.py => vision_language_embedding.py} (100%)
 rename examples/offline_inference/{offline_inference_vision_language_multi_image.py => vision_language_multi_image.py} (100%)
 rename examples/offline_inference/{offline_inference_whisper.py => whisper.py} (100%)

diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh
index 5a285be039393..4ae66f6f3215a 100644
--- a/.buildkite/run-cpu-test.sh
+++ b/.buildkite/run-cpu-test.sh
@@ -30,7 +30,7 @@ function cpu_tests() {
   # offline inference
   docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" bash -c "
     set -e
-    python3 examples/offline_inference/offline_inference.py"
+    python3 examples/offline_inference/basic.py"
 
   # Run basic model test
   docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
diff --git a/.buildkite/run-gh200-test.sh b/.buildkite/run-gh200-test.sh
index 1e5ff77895a38..3e4e409466b8a 100644
--- a/.buildkite/run-gh200-test.sh
+++ b/.buildkite/run-gh200-test.sh
@@ -24,5 +24,5 @@ remove_docker_container
 
 # Run the image and test offline inference
 docker run --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
-    python3 examples/offline_inference/offline_inference.py
+    python3 examples/offline_inference/basic.py
 '
diff --git a/.buildkite/run-hpu-test.sh b/.buildkite/run-hpu-test.sh
index a50570ab53438..8f3b08212fd6a 100644
--- a/.buildkite/run-hpu-test.sh
+++ b/.buildkite/run-hpu-test.sh
@@ -13,4 +13,4 @@ trap remove_docker_container EXIT
 remove_docker_container
 
 # Run the image and launch offline inference
-docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/offline_inference.py
\ No newline at end of file
+docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/basic.py
\ No newline at end of file
diff --git a/.buildkite/run-neuron-test.sh b/.buildkite/run-neuron-test.sh
index 52d485939b1d0..189714ebb6d75 100644
--- a/.buildkite/run-neuron-test.sh
+++ b/.buildkite/run-neuron-test.sh
@@ -51,4 +51,4 @@ docker run --rm -it --device=/dev/neuron0 --device=/dev/neuron1 --network host \
        -e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \
        --name "${container_name}" \
        ${image_name} \
-       /bin/bash -c "python3 /workspace/vllm/examples/offline_inference/offline_inference_neuron.py"
+       /bin/bash -c "python3 /workspace/vllm/examples/offline_inference/neuron.py"
diff --git a/.buildkite/run-openvino-test.sh b/.buildkite/run-openvino-test.sh
index 380f7a44a429a..6159b21ff8206 100755
--- a/.buildkite/run-openvino-test.sh
+++ b/.buildkite/run-openvino-test.sh
@@ -13,4 +13,4 @@ trap remove_docker_container EXIT
 remove_docker_container
 
 # Run the image and launch offline inference
-docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference/offline_inference.py
+docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference/basic.py
diff --git a/.buildkite/run-tpu-test.sh b/.buildkite/run-tpu-test.sh
index a8f021890f742..650af0fac4c61 100644
--- a/.buildkite/run-tpu-test.sh
+++ b/.buildkite/run-tpu-test.sh
@@ -23,4 +23,4 @@ docker run --privileged --net host --shm-size=16G -it \
     && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
     && python3 /workspace/vllm/tests/tpu/test_compilation.py \
     && python3 /workspace/vllm/tests/tpu/test_quantization_accuracy.py \
-    && python3 /workspace/vllm/examples/offline_inference/offline_inference_tpu.py"
+    && python3 /workspace/vllm/examples/offline_inference/tpu.py"
diff --git a/.buildkite/run-xpu-test.sh b/.buildkite/run-xpu-test.sh
index 160e10aa3bb9b..4d344e58db8ac 100644
--- a/.buildkite/run-xpu-test.sh
+++ b/.buildkite/run-xpu-test.sh
@@ -14,6 +14,6 @@ remove_docker_container
 
 # Run the image and test offline inference/tensor parallel
 docker run --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path --entrypoint="" xpu-test sh -c '
-    python3 examples/offline_inference/offline_inference.py
-    python3 examples/offline_inference/offline_inference_cli.py -tp 2
+    python3 examples/offline_inference/basic.py
+    python3 examples/offline_inference/cli.py -tp 2
 '
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 7d13269540864..d3bd809cfdf24 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -187,19 +187,19 @@ steps:
   - examples/
   commands:
     - pip install tensorizer # for tensorizer test
-    - python3 offline_inference/offline_inference.py
+    - python3 offline_inference/basic.py
     - python3 offline_inference/cpu_offload.py
-    - python3 offline_inference/offline_inference_chat.py
-    - python3 offline_inference/offline_inference_with_prefix.py
+    - python3 offline_inference/chat.py
+    - python3 offline_inference/prefix_caching.py
     - python3 offline_inference/llm_engine_example.py
-    - python3 offline_inference/offline_inference_vision_language.py
-    - python3 offline_inference/offline_inference_vision_language_multi_image.py
+    - python3 offline_inference/vision_language.py
+    - python3 offline_inference/vision_language_multi_image.py
     - python3 other/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 other/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
-    - python3 offline_inference/offline_inference_encoder_decoder.py
-    - python3 offline_inference/offline_inference_classification.py
-    - python3 offline_inference/offline_inference_embedding.py
-    - python3 offline_inference/offline_inference_scoring.py
-    - python3 offline_inference/offline_profile.py --model facebook/opt-125m run_num_steps --num-steps 2
+    - python3 offline_inference/encoder_decoder.py
+    - python3 offline_inference/classification.py
+    - python3 offline_inference/embedding.py
+    - python3 offline_inference/scoring.py
+    - python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2
 
 - label: Prefix Caching Test # 9min
   mirror_hardwares: [amd]
diff --git a/docs/source/contributing/profiling/profiling_index.md b/docs/source/contributing/profiling/profiling_index.md
index 97de40ff469f1..001db86bdf555 100644
--- a/docs/source/contributing/profiling/profiling_index.md
+++ b/docs/source/contributing/profiling/profiling_index.md
@@ -26,7 +26,7 @@ Set the env variable VLLM_RPC_TIMEOUT to a big number before you start the serve
 
 ### Offline Inference
 
-Refer to <gh-file:examples/offline_inference/offline_inference_with_profiler.py> for an example.
+Refer to <gh-file:examples/offline_inference/simple_profiling.py> for an example.
 
 ### OpenAI Server
 
diff --git a/docs/source/features/structured_outputs.md b/docs/source/features/structured_outputs.md
index a42c3dd64ad10..1d77c7339a33f 100644
--- a/docs/source/features/structured_outputs.md
+++ b/docs/source/features/structured_outputs.md
@@ -257,4 +257,4 @@ outputs = llm.generate(
 print(outputs[0].outputs[0].text)
 ```
 
-Full example: <gh-file:examples/offline_inference/offline_inference_structured_outputs.py>
+Full example: <gh-file:examples/offline_inference/structured_outputs.py>
diff --git a/docs/source/getting_started/installation/cpu-x86.md b/docs/source/getting_started/installation/cpu-x86.md
index bb046dd0fd9dc..f4d3eec0377b1 100644
--- a/docs/source/getting_started/installation/cpu-x86.md
+++ b/docs/source/getting_started/installation/cpu-x86.md
@@ -95,7 +95,7 @@ $ VLLM_TARGET_DEVICE=cpu python setup.py install
 $ sudo apt-get install libtcmalloc-minimal4 # install TCMalloc library
 $ find / -name *libtcmalloc* # find the dynamic link library path
 $ export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD # prepend the library to LD_PRELOAD
-$ python examples/offline_inference/offline_inference.py # run vLLM
+$ python examples/offline_inference/basic.py # run vLLM
 ```
 
 - When using the online serving, it is recommended to reserve 1-2 CPU cores for the serving framework to avoid CPU oversubscription. For example, on a platform with 32 physical CPU cores, reserving CPU 30 and 31 for the framework and using CPU 0-29 for OpenMP:
@@ -132,7 +132,7 @@ CPU NODE SOCKET CORE L1d:L1i:L2:L3 ONLINE    MAXMHZ   MINMHZ      MHZ
 
 # On this platform, it is recommend to only bind openMP threads on logical CPU cores 0-7 or 8-15
 $ export VLLM_CPU_OMP_THREADS_BIND=0-7
-$ python examples/offline_inference/offline_inference.py
+$ python examples/offline_inference/basic.py
 ```
 
 - If using vLLM CPU backend on a multi-socket machine with NUMA, be aware to set CPU cores using `VLLM_CPU_OMP_THREADS_BIND` to avoid cross NUMA node memory access.
diff --git a/docs/source/getting_started/quickstart.md b/docs/source/getting_started/quickstart.md
index d7d43785c6c24..6fd0083a9bb7b 100644
--- a/docs/source/getting_started/quickstart.md
+++ b/docs/source/getting_started/quickstart.md
@@ -40,7 +40,7 @@ For non-CUDA platforms, please refer [here](#installation-index) for specific in
 
 ## Offline Batched Inference
 
-With vLLM installed, you can start generating texts for list of input prompts (i.e. offline batch inferencing). See the example script: <gh-file:examples/offline_inference/offline_inference.py>
+With vLLM installed, you can start generating texts for list of input prompts (i.e. offline batch inferencing). See the example script: <gh-file:examples/offline_inference/basic.py>
 
 The first line of this example imports the classes {class}`~vllm.LLM` and {class}`~vllm.SamplingParams`:
 
diff --git a/docs/source/models/generative_models.md b/docs/source/models/generative_models.md
index 6a5a58ad74ab7..e4b4cd03a90d2 100644
--- a/docs/source/models/generative_models.md
+++ b/docs/source/models/generative_models.md
@@ -46,7 +46,7 @@ for output in outputs:
     print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
 ```
 
-A code example can be found here: <gh-file:examples/offline_inference/offline_inference.py>
+A code example can be found here: <gh-file:examples/offline_inference/basic.py>
 
 ### `LLM.beam_search`
 
@@ -103,7 +103,7 @@ for output in outputs:
     print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
 ```
 
-A code example can be found here: <gh-file:examples/offline_inference/offline_inference_chat.py>
+A code example can be found here: <gh-file:examples/offline_inference/chat.py>
 
 If the model doesn't have a chat template or you want to specify another one,
 you can explicitly pass a chat template:
diff --git a/docs/source/models/pooling_models.md b/docs/source/models/pooling_models.md
index 324b1f550e694..91db694be29a4 100644
--- a/docs/source/models/pooling_models.md
+++ b/docs/source/models/pooling_models.md
@@ -88,7 +88,7 @@ embeds = output.outputs.embedding
 print(f"Embeddings: {embeds!r} (size={len(embeds)})")
 ```
 
-A code example can be found here: <gh-file:examples/offline_inference/offline_inference_embedding.py>
+A code example can be found here: <gh-file:examples/offline_inference/embedding.py>
 
 ### `LLM.classify`
 
@@ -103,7 +103,7 @@ probs = output.outputs.probs
 print(f"Class Probabilities: {probs!r} (size={len(probs)})")
 ```
 
-A code example can be found here: <gh-file:examples/offline_inference/offline_inference_classification.py>
+A code example can be found here: <gh-file:examples/offline_inference/classification.py>
 
 ### `LLM.score`
 
@@ -125,7 +125,7 @@ score = output.outputs.score
 print(f"Score: {score}")
 ```
 
-A code example can be found here: <gh-file:examples/offline_inference/offline_inference_scoring.py>
+A code example can be found here: <gh-file:examples/offline_inference/scoring.py>
 
 ## Online Serving
 
diff --git a/docs/source/serving/multimodal_inputs.md b/docs/source/serving/multimodal_inputs.md
index a06f121a6899a..53f5a274e39a3 100644
--- a/docs/source/serving/multimodal_inputs.md
+++ b/docs/source/serving/multimodal_inputs.md
@@ -60,7 +60,7 @@ for o in outputs:
     print(generated_text)
 ```
 
-Full example: <gh-file:examples/offline_inference/offline_inference_vision_language.py>
+Full example: <gh-file:examples/offline_inference/vision_language.py>
 
 To substitute multiple images inside the same text prompt, you can pass in a list of images instead:
 
@@ -91,7 +91,7 @@ for o in outputs:
     print(generated_text)
 ```
 
-Full example: <gh-file:examples/offline_inference/offline_inference_vision_language_multi_image.py>
+Full example: <gh-file:examples/offline_inference/vision_language_multi_image.py>
 
 Multi-image input can be extended to perform video captioning. We show this with [Qwen2-VL](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct) as it supports videos:
 
@@ -125,13 +125,13 @@ for o in outputs:
 You can pass a list of NumPy arrays directly to the `'video'` field of the multi-modal dictionary
 instead of using multi-image input.
 
-Full example: <gh-file:examples/offline_inference/offline_inference_vision_language.py>
+Full example: <gh-file:examples/offline_inference/vision_language.py>
 
 ### Audio
 
 You can pass a tuple `(array, sampling_rate)` to the `'audio'` field of the multi-modal dictionary.
 
-Full example: <gh-file:examples/offline_inference/offline_inference_audio_language.py>
+Full example: <gh-file:examples/offline_inference/audio_language.py>
 
 ### Embedding
 
diff --git a/examples/offline_inference/offline_inference_arctic.py b/examples/offline_inference/arctic.py
similarity index 100%
rename from examples/offline_inference/offline_inference_arctic.py
rename to examples/offline_inference/arctic.py
diff --git a/examples/offline_inference/offline_inference_audio_language.py b/examples/offline_inference/audio_language.py
similarity index 100%
rename from examples/offline_inference/offline_inference_audio_language.py
rename to examples/offline_inference/audio_language.py
diff --git a/examples/offline_inference/offline_inference.py b/examples/offline_inference/basic.py
similarity index 100%
rename from examples/offline_inference/offline_inference.py
rename to examples/offline_inference/basic.py
diff --git a/examples/offline_inference/offline_inference_with_default_generation_config.py b/examples/offline_inference/basic_with_model_default_sampling.py
similarity index 100%
rename from examples/offline_inference/offline_inference_with_default_generation_config.py
rename to examples/offline_inference/basic_with_model_default_sampling.py
diff --git a/examples/offline_inference/offline_inference_chat.py b/examples/offline_inference/chat.py
similarity index 100%
rename from examples/offline_inference/offline_inference_chat.py
rename to examples/offline_inference/chat.py
diff --git a/examples/offline_inference/offline_chat_with_tools.py b/examples/offline_inference/chat_with_tools.py
similarity index 100%
rename from examples/offline_inference/offline_chat_with_tools.py
rename to examples/offline_inference/chat_with_tools.py
diff --git a/examples/offline_inference/offline_inference_classification.py b/examples/offline_inference/classification.py
similarity index 100%
rename from examples/offline_inference/offline_inference_classification.py
rename to examples/offline_inference/classification.py
diff --git a/examples/offline_inference/offline_inference_cli.py b/examples/offline_inference/cli.py
similarity index 100%
rename from examples/offline_inference/offline_inference_cli.py
rename to examples/offline_inference/cli.py
diff --git a/examples/offline_inference/offline_inference_distributed.py b/examples/offline_inference/distributed.py
similarity index 100%
rename from examples/offline_inference/offline_inference_distributed.py
rename to examples/offline_inference/distributed.py
diff --git a/examples/offline_inference/offline_inference_embedding.py b/examples/offline_inference/embedding.py
similarity index 100%
rename from examples/offline_inference/offline_inference_embedding.py
rename to examples/offline_inference/embedding.py
diff --git a/examples/offline_inference/offline_inference_encoder_decoder.py b/examples/offline_inference/encoder_decoder.py
similarity index 100%
rename from examples/offline_inference/offline_inference_encoder_decoder.py
rename to examples/offline_inference/encoder_decoder.py
diff --git a/examples/offline_inference/florence2_inference.py b/examples/offline_inference/florence2_inference.py
index 49dd2c331db5a..c24096e90004b 100644
--- a/examples/offline_inference/florence2_inference.py
+++ b/examples/offline_inference/florence2_inference.py
@@ -3,7 +3,7 @@
 encoder/decoder models, specifically Florence-2
 '''
 # TODO(Isotr0py):
-# Move to offline_inference/offline_inference_vision_language.py
+# Move to offline_inference/vision_language.py
 # after porting vision backbone
 from vllm import LLM, SamplingParams
 
diff --git a/examples/offline_inference/offline_inference_mlpspeculator.py b/examples/offline_inference/mlpspeculator.py
similarity index 100%
rename from examples/offline_inference/offline_inference_mlpspeculator.py
rename to examples/offline_inference/mlpspeculator.py
diff --git a/examples/offline_inference/offline_inference_neuron.py b/examples/offline_inference/neuron.py
similarity index 100%
rename from examples/offline_inference/offline_inference_neuron.py
rename to examples/offline_inference/neuron.py
diff --git a/examples/offline_inference/offline_inference_neuron_int8_quantization.py b/examples/offline_inference/neuron_int8_quantization.py
similarity index 100%
rename from examples/offline_inference/offline_inference_neuron_int8_quantization.py
rename to examples/offline_inference/neuron_int8_quantization.py
diff --git a/examples/offline_inference/offline_inference_openai/offline_inference_openai.md b/examples/offline_inference/openai/openai_batch.md
similarity index 92%
rename from examples/offline_inference/offline_inference_openai/offline_inference_openai.md
rename to examples/offline_inference/openai/openai_batch.md
index 6278a1943fe4a..a4774e57cd9a5 100644
--- a/examples/offline_inference/offline_inference_openai/offline_inference_openai.md
+++ b/examples/offline_inference/openai/openai_batch.md
@@ -8,7 +8,7 @@ This is a guide to performing batch inference using the OpenAI batch file format
  
 The OpenAI batch file format consists of a series of json objects on new lines.
  
-[See here for an example file.](https://github.com/vllm-project/vllm/blob/main/examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl)
+[See here for an example file.](https://github.com/vllm-project/vllm/blob/main/examples/offline_inference/openai/openai_example_batch.jsonl)
  
 Each line represents a separate request. See the [OpenAI package reference](https://platform.openai.com/docs/api-reference/batch/requestInput) for more details.
  
@@ -31,13 +31,13 @@ We currently only support `/v1/chat/completions` and `/v1/embeddings` endpoints
 To follow along with this example, you can download the example batch, or create your own batch file in your working directory.
 
 ```
-wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl
+wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai/openai_example_batch.jsonl
 ```
 
 Once you've created your batch file it should look like this
 
 ```
-$ cat offline_inference/offline_inference_openai/openai_example_batch.jsonl
+$ cat offline_inference/openai/openai_example_batch.jsonl
 {"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
 {"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
 ```
@@ -49,7 +49,7 @@ The batch running tool is designed to be used from the command line.
 You can run the batch with the following command, which will write its results to a file called `results.jsonl`
 
 ```
-python -m vllm.entrypoints.openai.run_batch -i offline_inference/offline_inference_openai/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct
+python -m vllm.entrypoints.openai.run_batch -i offline_inference/openai/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct
 ```
 
 ### Step 3: Check your results
@@ -66,10 +66,10 @@ $ cat results.jsonl
 
 The batch runner supports remote input and output urls that are accessible via http/https.
 
-For example, to run against our example input file located at `https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl`, you can run
+For example, to run against our example input file located at `https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai/openai_example_batch.jsonl`, you can run
 
 ```
-python -m vllm.entrypoints.openai.run_batch -i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct
+python -m vllm.entrypoints.openai.run_batch -i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct
 ```
 
 ## Example 3: Integrating with AWS S3
@@ -90,13 +90,13 @@ To integrate with cloud blob storage, we recommend using presigned urls.
 To follow along with this example, you can download the example batch, or create your own batch file in your working directory.
 
 ```
-wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl
+wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai/openai_example_batch.jsonl
 ```
 
 Once you've created your batch file it should look like this
 
 ```
-$ cat offline_inference/offline_inference_openai/openai_example_batch.jsonl
+$ cat offline_inference/openai/openai_example_batch.jsonl
 {"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
 {"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
 ```
@@ -104,7 +104,7 @@ $ cat offline_inference/offline_inference_openai/openai_example_batch.jsonl
 Now upload your batch file to your S3 bucket.
 
 ```
-aws s3 cp offline_inference/offline_inference_openai/openai_example_batch.jsonl s3://MY_BUCKET/MY_INPUT_FILE.jsonl
+aws s3 cp offline_inference/openai/openai_example_batch.jsonl s3://MY_BUCKET/MY_INPUT_FILE.jsonl
 ```
 
 ### Step 2: Generate your presigned urls
diff --git a/examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl b/examples/offline_inference/openai/openai_example_batch.jsonl
similarity index 100%
rename from examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl
rename to examples/offline_inference/openai/openai_example_batch.jsonl
diff --git a/examples/offline_inference/offline_inference_pixtral.py b/examples/offline_inference/pixtral.py
similarity index 100%
rename from examples/offline_inference/offline_inference_pixtral.py
rename to examples/offline_inference/pixtral.py
diff --git a/examples/offline_inference/offline_inference_with_prefix.py b/examples/offline_inference/prefix_caching.py
similarity index 100%
rename from examples/offline_inference/offline_inference_with_prefix.py
rename to examples/offline_inference/prefix_caching.py
diff --git a/examples/offline_inference/offline_profile.py b/examples/offline_inference/profiling.py
similarity index 99%
rename from examples/offline_inference/offline_profile.py
rename to examples/offline_inference/profiling.py
index 187a05e4d70a2..8a94b5c2a8623 100644
--- a/examples/offline_inference/offline_profile.py
+++ b/examples/offline_inference/profiling.py
@@ -363,7 +363,7 @@ def abort_requests():
 
     example:
     ```
-    python examples/offline_inference/offline_profile.py \\
+    python examples/offline_inference/profiling.py \\
         --model neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8 --batch-size 4 \\
         --prompt-len 512 --max-num-batched-tokens 8196 --json Llama31-8b-FP8 \\
         --enforce-eager run_num_steps -n 2
diff --git a/examples/offline_inference/offline_inference_scoring.py b/examples/offline_inference/scoring.py
similarity index 100%
rename from examples/offline_inference/offline_inference_scoring.py
rename to examples/offline_inference/scoring.py
diff --git a/examples/offline_inference/offline_inference_with_profiler.py b/examples/offline_inference/simple_profiling.py
similarity index 100%
rename from examples/offline_inference/offline_inference_with_profiler.py
rename to examples/offline_inference/simple_profiling.py
diff --git a/examples/offline_inference/offline_inference_structured_outputs.py b/examples/offline_inference/structured_outputs.py
similarity index 100%
rename from examples/offline_inference/offline_inference_structured_outputs.py
rename to examples/offline_inference/structured_outputs.py
diff --git a/examples/offline_inference/offline_inference_tpu.py b/examples/offline_inference/tpu.py
similarity index 100%
rename from examples/offline_inference/offline_inference_tpu.py
rename to examples/offline_inference/tpu.py
diff --git a/examples/offline_inference/offline_inference_vision_language.py b/examples/offline_inference/vision_language.py
similarity index 100%
rename from examples/offline_inference/offline_inference_vision_language.py
rename to examples/offline_inference/vision_language.py
diff --git a/examples/offline_inference/offline_inference_vision_language_embedding.py b/examples/offline_inference/vision_language_embedding.py
similarity index 100%
rename from examples/offline_inference/offline_inference_vision_language_embedding.py
rename to examples/offline_inference/vision_language_embedding.py
diff --git a/examples/offline_inference/offline_inference_vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py
similarity index 100%
rename from examples/offline_inference/offline_inference_vision_language_multi_image.py
rename to examples/offline_inference/vision_language_multi_image.py
diff --git a/examples/offline_inference/offline_inference_whisper.py b/examples/offline_inference/whisper.py
similarity index 100%
rename from examples/offline_inference/offline_inference_whisper.py
rename to examples/offline_inference/whisper.py
diff --git a/tests/plugins_tests/test_platform_plugins.py b/tests/plugins_tests/test_platform_plugins.py
index 57518bd3e8299..69698b34c71a3 100644
--- a/tests/plugins_tests/test_platform_plugins.py
+++ b/tests/plugins_tests/test_platform_plugins.py
@@ -5,7 +5,7 @@ def test_platform_plugins():
     import os
     example_file = os.path.join(
         os.path.dirname(os.path.dirname(os.path.dirname(current_file))),
-        "examples", "offline_inference/offline_inference.py")
+        "examples", "offline_inference/basic.py")
     runpy.run_path(example_file)
 
     # check if the plugin is loaded correctly
diff --git a/tools/profiler/print_layerwise_table.py b/tools/profiler/print_layerwise_table.py
index 49366abc7fb56..54cd60c2bc95b 100644
--- a/tools/profiler/print_layerwise_table.py
+++ b/tools/profiler/print_layerwise_table.py
@@ -31,7 +31,7 @@ def get_entries(node, curr_depth=0):
                         type=str,
                         required=True,
                         help="json trace file output by "
-                        "examples/offline_inference/offline_profile.py")
+                        "examples/offline_inference/profiling.py")
     parser.add_argument("--phase",
                         type=str,
                         required=True,
diff --git a/tools/profiler/visualize_layerwise_profile.py b/tools/profiler/visualize_layerwise_profile.py
index fa88ed4204d8f..cb56ebd69a8c1 100644
--- a/tools/profiler/visualize_layerwise_profile.py
+++ b/tools/profiler/visualize_layerwise_profile.py
@@ -538,7 +538,7 @@ def make_plot_title_suffix(profile_json: dict) -> str:
                         type=str,
                         required=True,
                         help="json trace file output by \
-                              examples/offline_inference/offline_profile.py")
+                              examples/offline_inference/profiling.py")
     parser.add_argument("--output-directory",
                         type=str,
                         required=False,

From f33e033e2782a9258d8ef6a359643944629d4ced Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Fri, 10 Jan 2025 23:51:02 +0800
Subject: [PATCH 076/115] [Docs] Fix docstring in `get_ip` function (#11932)

Signed-off-by: Kuntai Du <kuntai@uchicago.edu>
---
 vllm/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/utils.py b/vllm/utils.py
index 8c3e5200b3d98..217ccb25cef6d 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -446,7 +446,7 @@ def get_ip() -> str:
         logger.warning(
             "The environment variable HOST_IP is deprecated and ignored, as"
             " it is often used by Docker and other software to"
-            "interact with the container's network stack. Please"
+            "interact with the container's network stack. Please "
             "use VLLM_HOST_IP instead to set the IP address for vLLM processes"
             " to communicate with each other.")
     if host_ip:

From 5959564f94180a6a50e0d394e35a035c0c98a7fb Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Fri, 10 Jan 2025 23:51:43 +0800
Subject: [PATCH 077/115] Doc fix in `benchmark_long_document_qa_throughput.py`
 (#11933)

Signed-off-by: Kuntai Du <kuntai@uchicago.edu>
---
 benchmarks/benchmark_long_document_qa_throughput.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/benchmarks/benchmark_long_document_qa_throughput.py b/benchmarks/benchmark_long_document_qa_throughput.py
index 13477ef535e86..0b8fba38156f1 100644
--- a/benchmarks/benchmark_long_document_qa_throughput.py
+++ b/benchmarks/benchmark_long_document_qa_throughput.py
@@ -2,8 +2,7 @@
 Offline benchmark to test the long document QA throughput.
 
 Example usage:
-    # This command run the vllm with 50GB CPU memory for offloading
-    # The workload samples 8 different prompts with a default input
+    # This workload samples 8 different prompts with a default input
     # length of 20000 tokens, then replicates each prompt 2 times 
     # in random order.
     python benchmark_long_document_qa_throughput.py \

From aa1e77a19ce658abcbaa0836f96878a7ae9dea84 Mon Sep 17 00:00:00 2001
From: "Li, Jiang" <jiang1.li@intel.com>
Date: Sat, 11 Jan 2025 00:07:58 +0800
Subject: [PATCH 078/115] [Hardware][CPU] Support MOE models on x86 CPU
 (#11831)

Signed-off-by: jiang1.li <jiang1.li@intel.com>
---
 .../getting_started/installation/cpu-x86.md   |  2 +-
 .../decoder_only/language/test_models.py      |  4 ++
 vllm/model_executor/layers/fused_moe/layer.py | 41 +++++++++++++++++--
 3 files changed, 43 insertions(+), 4 deletions(-)

diff --git a/docs/source/getting_started/installation/cpu-x86.md b/docs/source/getting_started/installation/cpu-x86.md
index f4d3eec0377b1..26bdcd93ad190 100644
--- a/docs/source/getting_started/installation/cpu-x86.md
+++ b/docs/source/getting_started/installation/cpu-x86.md
@@ -5,7 +5,7 @@
 vLLM initially supports basic model inferencing and serving on x86 CPU platform, with data types FP32, FP16 and BF16. vLLM CPU backend supports the following vLLM features:
 
 - Tensor Parallel
-- Model Quantization (`INT8 W8A8, AWQ`)
+- Model Quantization (`INT8 W8A8, AWQ, GPTQ`)
 - Chunked-prefill
 - Prefix-caching
 - FP8-E5M2 KV-Caching (TODO)
diff --git a/tests/models/decoder_only/language/test_models.py b/tests/models/decoder_only/language/test_models.py
index 2a7ed8826d2f3..4e110366a09f3 100644
--- a/tests/models/decoder_only/language/test_models.py
+++ b/tests/models/decoder_only/language/test_models.py
@@ -48,6 +48,10 @@
         ),
         pytest.param("stabilityai/stablelm-3b-4e1t"),  # stablelm
         pytest.param("bigcode/starcoder2-3b"),  # starcoder2
+        pytest.param(
+            "ehristoforu/Falcon3-MoE-2x7B-Insruct",  # mixtral
+            marks=[pytest.mark.cpu_model],
+        )
     ])
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [32])
diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py
index b108cbd52c218..cf5db368926b4 100644
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -13,6 +13,7 @@
     QuantizationConfig, QuantizeMethodBase)
 from vllm.model_executor.utils import set_weight_attrs
 from vllm.platforms import current_platform
+from vllm.platforms.interface import CpuArchEnum
 
 if current_platform.is_cuda_alike():
     from .fused_moe import fused_experts
@@ -83,6 +84,20 @@ def create_weights(self, layer: torch.nn.Module, num_experts: int,
         layer.register_parameter("w2_weight", w2_weight)
         set_weight_attrs(w2_weight, extra_weight_attrs)
 
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        super().process_weights_after_loading(layer)
+
+        if current_platform.is_cpu():
+            if current_platform.get_cpu_architecture() == CpuArchEnum.X86:
+                import intel_extension_for_pytorch as ipex
+                layer.ipex_fusion = ipex.llm.modules.GatedMLPMOE(
+                    layer.w13_weight,
+                    layer.w2_weight,
+                    use_prepack=True,
+                )
+            else:
+                raise NotImplementedError("CPU MOE only supports x86 arch.")
+
     def apply(
         self,
         layer: torch.nn.Module,
@@ -142,9 +157,29 @@ def forward_cuda(
                              topk_ids=topk_ids,
                              inplace=True)
 
-    def forward_cpu(self, *args, **kwargs):
-        raise NotImplementedError(
-            "The CPU backend currently does not support MoE.")
+    def forward_cpu(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        use_grouped_topk: bool,
+        top_k: int,
+        router_logits: torch.Tensor,
+        renormalize: bool,
+        topk_group: Optional[int] = None,
+        num_expert_group: Optional[int] = None,
+        custom_routing_function: Optional[Callable] = None,
+        **kwargs,
+    ):
+        assert custom_routing_function is None
+        return layer.ipex_fusion(
+            x,
+            use_grouped_topk,
+            top_k,
+            router_logits,
+            renormalize,
+            topk_group,
+            num_expert_group,
+        )
 
     def forward_tpu(
         self,

From 46fa98ccad444dbacc7f95995b79f65ddab3ff7c Mon Sep 17 00:00:00 2001
From: Isotr0py <mozf@mail2.sysu.edu.cn>
Date: Sat, 11 Jan 2025 03:19:15 +0800
Subject: [PATCH 079/115] [Misc] Clean up debug code in Deepseek-V3 (#11930)

Signed-off-by: Isotr0py <2037008807@qq.com>
---
 vllm/model_executor/models/deepseek_v3.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/vllm/model_executor/models/deepseek_v3.py b/vllm/model_executor/models/deepseek_v3.py
index 333dc019b4d99..d4710622681b5 100644
--- a/vllm/model_executor/models/deepseek_v3.py
+++ b/vllm/model_executor/models/deepseek_v3.py
@@ -639,9 +639,6 @@ def load_weights(self, weights: Iterable[Tuple[str,
                     if is_pp_missing_parameter(name, self):
                         continue
 
-                    if name not in params_dict:
-                        for key in params_dict:
-                            print(key)
                     param = params_dict[name]
                     weight_loader = getattr(param, "weight_loader",
                                             default_weight_loader)

From 8a579408f33e2f98a89d57418342b53a41622d2f Mon Sep 17 00:00:00 2001
From: minmin <rmm0811@gmail.com>
Date: Sat, 11 Jan 2025 04:39:22 +0800
Subject: [PATCH 080/115] [Misc] Update benchmark_prefix_caching.py fixed
 example usage (#11920)

Signed-off-by: Ren MinMin <renmm6@chinaunicom.cn>
Co-authored-by: Ren MinMin <renmm6@chinaunicom.cn>
---
 benchmarks/benchmark_prefix_caching.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/benchmarks/benchmark_prefix_caching.py b/benchmarks/benchmark_prefix_caching.py
index 5e9381f712e10..3ab421a89c935 100644
--- a/benchmarks/benchmark_prefix_caching.py
+++ b/benchmarks/benchmark_prefix_caching.py
@@ -10,7 +10,8 @@
         --model meta-llama/Llama-2-7b-chat-hf \
         --enable-prefix-caching \
         --num-prompts 1 \
-        --repeat-count 100
+        --repeat-count 100 \
+        --input-length-range 128:256
 
 ShareGPT example usage:
     # This command samples 20 prompts with input lengths

From d45cbe70f5bf25bb2f490f4152c256e9acb2a62b Mon Sep 17 00:00:00 2001
From: Travis Johnson <tsjohnso@us.ibm.com>
Date: Fri, 10 Jan 2025 16:26:00 -0700
Subject: [PATCH 081/115] [Bugfix] Check that number of images matches number
 of <|image|> tokens with mllama (#11939)

Signed-off-by: Travis Johnson <tsjohnso@us.ibm.com>
---
 vllm/model_executor/models/mllama.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py
index c5046e06edecb..593a4d3fb6940 100644
--- a/vllm/model_executor/models/mllama.py
+++ b/vllm/model_executor/models/mllama.py
@@ -123,6 +123,13 @@ def input_processor_for_mllama(
 
     assert is_list_of(image_data, Image.Image)
 
+    num_image_tokens = dec_inputs['prompt_token_ids'].count(
+        MLLAMA_IMAGE_TOKEN_ID)
+    if num_image_tokens != len(image_data):
+        raise ValueError(
+            f"The number of image tokens ({num_image_tokens}) must be"
+            f" the same as the number of images ({len(image_data)})")
+
     # Since only the last group of consecutive images
     # are attended by the decoded tokens, we only need to
     # get the number of tiles for those images.
@@ -1493,6 +1500,8 @@ def convert_sparse_cross_attention_mask_to_dense(
             dense_mask[seq_start + start:seq_start + end,
                        tile_start:tile_start + tile] = 1
             tile_start += tile
+        assert ts != -1
+        assert td != 0
         tile_range_for_decode.append((ts, ts + td))
         seq_start += length
 

From c9f09a4fe83ef13824ea1663214ac7aad08d2b31 Mon Sep 17 00:00:00 2001
From: Fred Reiss <frreiss@us.ibm.com>
Date: Fri, 10 Jan 2025 17:04:58 -0800
Subject: [PATCH 082/115] [mypy] Fix mypy warnings in api_server.py (#11941)

Signed-off-by: Fred Reiss <frreiss@us.ibm.com>
---
 vllm/entrypoints/openai/api_server.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index 925d7db43138b..1aeefe86cd05e 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -14,7 +14,7 @@
 from contextlib import asynccontextmanager
 from functools import partial
 from http import HTTPStatus
-from typing import AsyncIterator, Optional, Set, Tuple
+from typing import AsyncIterator, Dict, Optional, Set, Tuple, Union
 
 import uvloop
 from fastapi import APIRouter, FastAPI, HTTPException, Request
@@ -420,6 +420,8 @@ async def create_embedding(request: EmbeddingRequest, raw_request: Request):
             "use the Pooling API (`/pooling`) instead.")
 
         res = await fallback_handler.create_pooling(request, raw_request)
+
+        generator: Union[ErrorResponse, EmbeddingResponse]
         if isinstance(res, PoolingResponse):
             generator = EmbeddingResponse(
                 id=res.id,
@@ -494,7 +496,7 @@ async def create_score_v1(request: ScoreRequest, raw_request: Request):
     return await create_score(request, raw_request)
 
 
-TASK_HANDLERS = {
+TASK_HANDLERS: Dict[str, Dict[str, tuple]] = {
     "generate": {
         "messages": (ChatCompletionRequest, create_chat_completion),
         "default": (CompletionRequest, create_completion),
@@ -652,7 +654,7 @@ async def add_request_id(request: Request, call_next):
         module_path, object_name = middleware.rsplit(".", 1)
         imported = getattr(importlib.import_module(module_path), object_name)
         if inspect.isclass(imported):
-            app.add_middleware(imported)
+            app.add_middleware(imported)  # type: ignore[arg-type]
         elif inspect.iscoroutinefunction(imported):
             app.middleware("http")(imported)
         else:

From 899136b857d510f0e19b0e21ea32b49f8aa117ed Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Sat, 11 Jan 2025 09:07:24 +0800
Subject: [PATCH 083/115] [ci] fix broken distributed-tests-4-gpus (#11937)

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 tests/spec_decode/e2e/test_integration_dist_tp4.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/spec_decode/e2e/test_integration_dist_tp4.py b/tests/spec_decode/e2e/test_integration_dist_tp4.py
index 555aef99218c3..2cb10de1c6f55 100644
--- a/tests/spec_decode/e2e/test_integration_dist_tp4.py
+++ b/tests/spec_decode/e2e/test_integration_dist_tp4.py
@@ -108,7 +108,8 @@ def test_skip_speculation(common_llm_kwargs, per_test_common_llm_kwargs,
 
     TODO: fix it to pass without raising Error. (#5814)
     """
-    with pytest.raises(openai.APIConnectionError):
+    with pytest.raises(
+        (openai.APIConnectionError, openai.InternalServerError)):
         run_equality_correctness_test_tp(MAIN_MODEL,
                                          common_llm_kwargs,
                                          per_test_common_llm_kwargs,

From 2118d0565cb52c69e0bbacdcf48af9ecef39e528 Mon Sep 17 00:00:00 2001
From: Sungjae Lee <33976427+llsj14@users.noreply.github.com>
Date: Sat, 11 Jan 2025 13:49:38 +0900
Subject: [PATCH 084/115] [Bugfix][SpecDecode] Adjust Eagle model architecture
 to align with intended design (#11672)

Signed-off-by: Sungjae Lee <33976427+llsj14@users.noreply.github.com>
---
 vllm/model_executor/models/eagle.py | 26 ++++++++++++++++++++++++--
 1 file changed, 24 insertions(+), 2 deletions(-)

diff --git a/vllm/model_executor/models/eagle.py b/vllm/model_executor/models/eagle.py
index f138d13630263..eb7b5af19ae96 100644
--- a/vllm/model_executor/models/eagle.py
+++ b/vllm/model_executor/models/eagle.py
@@ -17,14 +17,30 @@
 from .utils import maybe_prefix
 
 
+class DummyInputLayerNorm(nn.Module):
+
+    def forward(self, x):
+        return x
+
+
+class DummyOutputNorm(nn.Module):
+
+    def forward(self, x, residual):
+        if residual is None:
+            return x
+        else:
+            return x, residual
+
+
 class EAGLE(nn.Module):
     """This class implements the EAGLE draft model from the paper: https://arxiv.org/pdf/2401.15077
     Reference implementation: https://github.com/SafeAILab/EAGLE
     
     Differences from reference implementation:
     1. In reference, LlamaDecoderLayer implementation doesn't have 
-       input_layernorm for 1st decoder layer (https://github.com/SafeAILab/EAGLE/blob/7d065d084443fbfd386f88839efd7193c12be869/eagle/model/cnets.py#L427) 
-       but we do as HF implementation also does.
+       input_layernorm for 1st decoder layer (https://github.com/SafeAILab/EAGLE/blob/7d065d084443fbfd386f88839efd7193c12be869/eagle/model/cnets.py#L427).
+       Following this approach, our implementation also disables
+       the input_layernorm for the first decoder layer.
     2. We allow any decoder layer to be used in EAGLE whereas in reference 
        decoder layer is fixed to be LlamaDecoderLayer.
     3. We have an optional token_map which reduces draft vocab to most 
@@ -46,10 +62,16 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
 
         self.model = model_cls(vllm_config=vllm_config,
                                prefix=maybe_prefix(prefix, "model"))
+
         self.fc = nn.Linear(config.model.hidden_size * 2,
                             config.model.hidden_size,
                             bias=getattr(self.config, "eagle_fc_bias", False))
 
+        # Modify layer normalization and residual connections as suggested
+        # in the EAGLE framework: https://github.com/SafeAILab/EAGLE
+        self.model.model.layers[0].input_layernorm = DummyInputLayerNorm()
+        self.model.model.norm = DummyOutputNorm()
+
         self.orig_vocab_size = config.vocab_size
         self.truncated_vocab_size = config.truncated_vocab_size
         self.unpadded_vocab_size = self.truncated_vocab_size

From c32a7c7c0c688ed81d2f4ad701a09d0edd095ffe Mon Sep 17 00:00:00 2001
From: shaochangxu <85155497+shaochangxu@users.noreply.github.com>
Date: Sat, 11 Jan 2025 13:49:39 +0800
Subject: [PATCH 085/115] [Bugfix] fused_experts_impl wrong compute type for
 float32 (#11921)

Signed-off-by: shaochangxu.scx <shaochangxu.scx@antgroup.com>
Co-authored-by: shaochangxu.scx <shaochangxu.scx@antgroup.com>
---
 vllm/model_executor/layers/fused_moe/fused_moe.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py
index 1bb6bc753d37c..3ea6217d7c0ef 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -701,8 +701,14 @@ def fused_experts_impl(hidden_states: torch.Tensor,
                                       device=hidden_states.device,
                                       dtype=hidden_states.dtype)
 
-    compute_type = (tl.bfloat16
-                    if hidden_states.dtype == torch.bfloat16 else tl.float16)
+    if hidden_states.dtype == torch.bfloat16:
+        compute_type = tl.bfloat16
+    elif hidden_states.dtype == torch.float16:
+        compute_type = tl.float16
+    elif hidden_states.dtype == torch.float32:
+        compute_type = tl.float32
+    else:
+        raise ValueError(f"Unsupported compute_type: {hidden_states.dtype}")
 
     if inplace:
         out_hidden_states = hidden_states

From 7a3a83e3b87f50fe9c0985a5c5bcc1d4cf2e95cd Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Sat, 11 Jan 2025 13:50:05 +0800
Subject: [PATCH 086/115] [CI/Build] Move model-specific multi-modal processing
 tests (#11934)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 .buildkite/test-pipeline.yaml                 |   1 +
 .../processing => multimodal}/__init__.py     |   0
 .../models/multimodal/processing/__init__.py  |   0
 .../multimodal/processing/test_common.py      | 201 +++++++++++++++
 .../processing/test_idefics3.py               |   4 +-
 .../processing/test_internvl.py               |   4 +-
 .../processing/test_llava_next.py             |   2 +-
 .../processing/test_llava_onevision.py        |   2 +-
 .../processing/test_phi3v.py                  |   4 +-
 .../processing/test_qwen.py                   |   4 +-
 .../processing/test_qwen2_vl.py               |   4 +-
 tests/multimodal/test_processing.py           | 232 +-----------------
 tests/multimodal/utils.py                     |  33 +++
 13 files changed, 251 insertions(+), 240 deletions(-)
 rename tests/models/{decoder_only/vision_language/processing => multimodal}/__init__.py (100%)
 create mode 100644 tests/models/multimodal/processing/__init__.py
 create mode 100644 tests/models/multimodal/processing/test_common.py
 rename tests/models/{decoder_only/vision_language => multimodal}/processing/test_idefics3.py (98%)
 rename tests/models/{decoder_only/vision_language => multimodal}/processing/test_internvl.py (98%)
 rename tests/models/{decoder_only/vision_language => multimodal}/processing/test_llava_next.py (99%)
 rename tests/models/{decoder_only/vision_language => multimodal}/processing/test_llava_onevision.py (99%)
 rename tests/models/{decoder_only/vision_language => multimodal}/processing/test_phi3v.py (95%)
 rename tests/models/{decoder_only/vision_language => multimodal}/processing/test_qwen.py (98%)
 rename tests/models/{decoder_only/vision_language => multimodal}/processing/test_qwen2_vl.py (96%)
 create mode 100644 tests/multimodal/utils.py

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index d3bd809cfdf24..cf82210f96ee3 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -368,6 +368,7 @@ steps:
   - tests/models/encoder_decoder/vision_language
   commands:
     - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+    - pytest -v -s models/multimodal
     - pytest -v -s models/decoder_only/audio_language -m 'core_model or quant_model'
     - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'core_model or quant_model'
     - pytest -v -s models/embedding/vision_language -m core_model
diff --git a/tests/models/decoder_only/vision_language/processing/__init__.py b/tests/models/multimodal/__init__.py
similarity index 100%
rename from tests/models/decoder_only/vision_language/processing/__init__.py
rename to tests/models/multimodal/__init__.py
diff --git a/tests/models/multimodal/processing/__init__.py b/tests/models/multimodal/processing/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py
new file mode 100644
index 0000000000000..0a38779e0e4f0
--- /dev/null
+++ b/tests/models/multimodal/processing/test_common.py
@@ -0,0 +1,201 @@
+from functools import partial
+
+import numpy as np
+import pytest
+from PIL import Image
+
+from vllm.config import ModelConfig
+from vllm.inputs import InputProcessingContext
+from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.processing import ProcessingCache
+from vllm.multimodal.utils import cached_get_tokenizer
+
+from ....multimodal.utils import random_audio, random_image, random_video
+
+
+def _test_processing_correctness(
+    model_id: str,
+    modalities: dict[str, bool],
+    hit_rate: float,
+    num_batches: int,
+    simplify_rate: float,
+):
+    if model_id == "TIGER-Lab/Mantis-8B-siglip-llama3":
+        hf_overrides = {"architectures": ["MantisForConditionalGeneration"]}
+    else:
+        hf_overrides = {}
+
+    limit_mm_per_prompt = {
+        modality: 3 if supports_multi else 1
+        for modality, supports_multi in modalities.items()
+    }
+
+    model_config = ModelConfig(
+        model_id,
+        task="auto",
+        tokenizer=model_id,
+        tokenizer_mode="auto",
+        trust_remote_code=True,
+        seed=0,
+        dtype="float16",
+        revision=None,
+        hf_overrides=hf_overrides,
+        limit_mm_per_prompt=limit_mm_per_prompt,
+    )
+
+    model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config)
+    factories = MULTIMODAL_REGISTRY._processor_factories[model_cls]
+    ctx = InputProcessingContext(
+        model_config,
+        tokenizer=cached_get_tokenizer(model_config.tokenizer),
+    )
+    # Ensure that it can fit all of the data
+    cache = ProcessingCache(capacity=1 << 30)
+
+    baseline_processor = factories.build_processor(ctx, cache=None)
+    cached_processor = factories.build_processor(ctx, cache=cache)
+    dummy_inputs = baseline_processor.dummy_inputs
+    tokenizer = baseline_processor.info.get_tokenizer()
+
+    rng = np.random.RandomState(0)
+
+    input_to_hit = {
+        "image": Image.new("RGB", size=(128, 128)),
+        "video": np.zeros((4, 128, 128, 3), dtype=np.uint8),
+        "audio": (np.zeros((512, )), 16000),
+    }
+    input_factory = {
+        "image":
+        partial(random_image, rng, min_wh=128, max_wh=256),
+        "video":
+        partial(random_video,
+                rng,
+                min_frames=2,
+                max_frames=8,
+                min_wh=128,
+                max_wh=256),
+        "audio":
+        partial(random_audio, rng, min_len=512, max_len=1024, sr=16000),
+    }
+
+    for batch_idx in range(num_batches):
+        mm_data = {
+            k:
+            [(input_to_hit[k] if rng.rand() < hit_rate else input_factory[k]())
+             for _ in range(rng.randint(limit_mm_per_prompt[k]))]
+            for k in modalities
+        }
+
+        mm_counts = {k: len(vs) for k, vs in mm_data.items()}
+        prompt = dummy_inputs.get_dummy_processor_inputs(
+            model_config.max_model_len,
+            mm_counts,
+        ).prompt_text
+
+        # Drop unnecessary keys and test single -> multi conversion
+        if rng.rand() < simplify_rate:
+            for k in list(mm_data.keys()):
+                if not mm_data[k]:
+                    del mm_data[k]
+                elif len(mm_data[k]) == 1:
+                    mm_data[k] = mm_data[k][0]
+
+        baseline_result = baseline_processor.apply(
+            prompt,
+            mm_data=mm_data,
+            hf_processor_mm_kwargs={},
+        )
+        cached_result = cached_processor.apply(
+            prompt,
+            mm_data=mm_data,
+            hf_processor_mm_kwargs={},
+        )
+
+        assert baseline_result == cached_result, (
+            f"Failed ({batch_idx=}, {prompt=}, {mm_data=})")
+
+        baseline_tokenized_result = baseline_processor.apply(
+            tokenizer.encode(prompt),
+            mm_data=mm_data,
+            hf_processor_mm_kwargs={},
+        )
+
+        assert baseline_result == baseline_tokenized_result, (
+            f"Failed ({batch_idx=}, {prompt=}, {mm_data=})")
+
+        cached_tokenized_result = cached_processor.apply(
+            tokenizer.encode(prompt),
+            mm_data=mm_data,
+            hf_processor_mm_kwargs={},
+        )
+
+        assert cached_result == cached_tokenized_result, (
+            f"Failed ({batch_idx=}, {prompt=}, {mm_data=})")
+
+
+# yapf: disable
+# True if the model supports multiple data items of the modality per request
+@pytest.mark.parametrize(("model_id", "modalities"), [
+    ("rhymes-ai/Aria", {"image": True}),
+    ("Salesforce/blip2-opt-2.7b", {"image": False}),
+    ("facebook/chameleon-7b", {"image": False}),
+    ("adept/fuyu-8b", {"image": False}),
+    ("llava-hf/llava-1.5-7b-hf", {"image": True}),
+    ("llava-hf/llava-v1.6-mistral-7b-hf", {"image": True}),
+    ("llava-hf/LLaVA-NeXT-Video-7B-hf", {"video": False}),
+    ("llava-hf/llava-onevision-qwen2-0.5b-ov-hf", {"image": True, "video": True}),  # noqa: E501
+    ("TIGER-Lab/Mantis-8B-siglip-llama3", {"image": True}),
+    ("mistral-community/pixtral-12b", {"image": True}),
+    ("Qwen/Qwen2-VL-2B-Instruct", {"image": True, "video": True}),
+    ("Qwen/Qwen2-Audio-7B-Instruct", {"audio": True}),
+    ("fixie-ai/ultravox-v0_3", {"audio": True}),
+])
+@pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0])
+@pytest.mark.parametrize("num_batches", [32])
+@pytest.mark.parametrize("simplify_rate", [1.0])
+# yapf: enable
+def test_processing_correctness(
+    model_id: str,
+    modalities: dict[str, bool],
+    hit_rate: float,
+    num_batches: int,
+    simplify_rate: float,
+):
+    _test_processing_correctness(
+        model_id,
+        modalities,
+        hit_rate=hit_rate,
+        num_batches=num_batches,
+        simplify_rate=simplify_rate,
+    )
+
+
+# yapf: disable
+@pytest.mark.parametrize(("model_id", "modalities"), [
+    ("microsoft/Phi-3-vision-128k-instruct", {"image": True}),
+])
+@pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0])
+@pytest.mark.parametrize("num_batches", [32])
+@pytest.mark.parametrize("simplify_rate", [1.0])
+# yapf: enable
+def test_processing_correctness_phi3v(
+    model_id: str,
+    modalities: dict[str, bool],
+    hit_rate: float,
+    num_batches: int,
+    simplify_rate: float,
+):
+    # HACK - this is an attempted workaround for the following bug
+    # https://github.com/huggingface/transformers/issues/34307
+    from transformers import AutoImageProcessor  # noqa: F401
+    from transformers import AutoProcessor  # noqa: F401
+
+    AutoImageProcessor.from_pretrained(model_id, trust_remote_code=True)
+
+    _test_processing_correctness(
+        model_id,
+        modalities,
+        hit_rate=hit_rate,
+        num_batches=num_batches,
+        simplify_rate=simplify_rate,
+    )
diff --git a/tests/models/decoder_only/vision_language/processing/test_idefics3.py b/tests/models/multimodal/processing/test_idefics3.py
similarity index 98%
rename from tests/models/decoder_only/vision_language/processing/test_idefics3.py
rename to tests/models/multimodal/processing/test_idefics3.py
index c71a2d359043d..69b91ad4a5df8 100644
--- a/tests/models/decoder_only/vision_language/processing/test_idefics3.py
+++ b/tests/models/multimodal/processing/test_idefics3.py
@@ -8,8 +8,8 @@
 from vllm.inputs import InputContext, token_inputs
 from vllm.multimodal import MultiModalRegistry
 
-from .....conftest import _ImageAssets
-from ....utils import build_model_context
+from ....conftest import _ImageAssets
+from ...utils import build_model_context
 
 models = ["HuggingFaceM4/Idefics3-8B-Llama3"]
 
diff --git a/tests/models/decoder_only/vision_language/processing/test_internvl.py b/tests/models/multimodal/processing/test_internvl.py
similarity index 98%
rename from tests/models/decoder_only/vision_language/processing/test_internvl.py
rename to tests/models/multimodal/processing/test_internvl.py
index af0c2aa211998..d6c60595ca5ea 100644
--- a/tests/models/decoder_only/vision_language/processing/test_internvl.py
+++ b/tests/models/multimodal/processing/test_internvl.py
@@ -7,8 +7,8 @@
 from vllm.inputs import InputContext, token_inputs
 from vllm.multimodal import MultiModalRegistry
 
-from .....conftest import _ImageAssets
-from ....utils import build_model_context
+from ....conftest import _ImageAssets
+from ...utils import build_model_context
 
 models = ["OpenGVLab/InternVL2-2B"]
 
diff --git a/tests/models/decoder_only/vision_language/processing/test_llava_next.py b/tests/models/multimodal/processing/test_llava_next.py
similarity index 99%
rename from tests/models/decoder_only/vision_language/processing/test_llava_next.py
rename to tests/models/multimodal/processing/test_llava_next.py
index 689d17be81889..1eec35d9c3c72 100644
--- a/tests/models/decoder_only/vision_language/processing/test_llava_next.py
+++ b/tests/models/multimodal/processing/test_llava_next.py
@@ -10,7 +10,7 @@
 from vllm.multimodal.processing import BaseMultiModalProcessor
 from vllm.multimodal.utils import cached_get_tokenizer
 
-from ....utils import build_model_context
+from ...utils import build_model_context
 
 
 def _validate_image_prompt_replacements_one(
diff --git a/tests/models/decoder_only/vision_language/processing/test_llava_onevision.py b/tests/models/multimodal/processing/test_llava_onevision.py
similarity index 99%
rename from tests/models/decoder_only/vision_language/processing/test_llava_onevision.py
rename to tests/models/multimodal/processing/test_llava_onevision.py
index a033354f0e9b8..94ea604c58b43 100644
--- a/tests/models/decoder_only/vision_language/processing/test_llava_onevision.py
+++ b/tests/models/multimodal/processing/test_llava_onevision.py
@@ -10,7 +10,7 @@
 from vllm.multimodal.processing import BaseMultiModalProcessor
 from vllm.multimodal.utils import cached_get_tokenizer
 
-from ....utils import build_model_context
+from ...utils import build_model_context
 
 
 def _validate_image_prompt_replacements_one(
diff --git a/tests/models/decoder_only/vision_language/processing/test_phi3v.py b/tests/models/multimodal/processing/test_phi3v.py
similarity index 95%
rename from tests/models/decoder_only/vision_language/processing/test_phi3v.py
rename to tests/models/multimodal/processing/test_phi3v.py
index c5b77260c6544..7f82a8f18f0ca 100644
--- a/tests/models/decoder_only/vision_language/processing/test_phi3v.py
+++ b/tests/models/multimodal/processing/test_phi3v.py
@@ -4,8 +4,8 @@
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.utils import cached_get_tokenizer
 
-from .....conftest import _ImageAssets
-from ....utils import build_model_context
+from ....conftest import _ImageAssets
+from ...utils import build_model_context
 
 
 @pytest.mark.parametrize("model_id", ["microsoft/Phi-3.5-vision-instruct"])
diff --git a/tests/models/decoder_only/vision_language/processing/test_qwen.py b/tests/models/multimodal/processing/test_qwen.py
similarity index 98%
rename from tests/models/decoder_only/vision_language/processing/test_qwen.py
rename to tests/models/multimodal/processing/test_qwen.py
index 163220c91a27d..af0ace711ba3e 100644
--- a/tests/models/decoder_only/vision_language/processing/test_qwen.py
+++ b/tests/models/multimodal/processing/test_qwen.py
@@ -9,8 +9,8 @@
 from vllm.multimodal import MultiModalKwargs
 from vllm.multimodal.utils import cached_get_tokenizer
 
-from .....conftest import IMAGE_ASSETS
-from ....utils import build_model_context
+from ....conftest import IMAGE_ASSETS
+from ...utils import build_model_context
 
 ### Multimodal preprocessing tests
 SAMPLE_IMAGE = IMAGE_ASSETS[0].pil_image
diff --git a/tests/models/decoder_only/vision_language/processing/test_qwen2_vl.py b/tests/models/multimodal/processing/test_qwen2_vl.py
similarity index 96%
rename from tests/models/decoder_only/vision_language/processing/test_qwen2_vl.py
rename to tests/models/multimodal/processing/test_qwen2_vl.py
index 0d54802f2b733..de14fbbffe5b7 100644
--- a/tests/models/decoder_only/vision_language/processing/test_qwen2_vl.py
+++ b/tests/models/multimodal/processing/test_qwen2_vl.py
@@ -3,8 +3,8 @@
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.utils import cached_get_tokenizer
 
-from .....conftest import _ImageAssets
-from ....utils import build_model_context
+from ....conftest import _ImageAssets
+from ...utils import build_model_context
 
 
 @pytest.mark.parametrize("model_id", ["Qwen/Qwen2-VL-2B-Instruct"])
diff --git a/tests/multimodal/test_processing.py b/tests/multimodal/test_processing.py
index d18909a4197b6..54269c3ef7ce0 100644
--- a/tests/multimodal/test_processing.py
+++ b/tests/multimodal/test_processing.py
@@ -1,30 +1,25 @@
 from contextlib import nullcontext
-from functools import partial
 from typing import cast
 from unittest.mock import MagicMock
 
 import numpy as np
 import pytest
-from PIL import Image
 
 from vllm.config import ModelConfig
-from vllm.inputs import InputProcessingContext
 from vllm.multimodal import MULTIMODAL_REGISTRY
-# yapf conflicts with isort for this block
-# yapf: disable
-from vllm.multimodal.processing import (PlaceholderInfo, ProcessingCache,
-                                        PromptReplacement,
+from vllm.multimodal.processing import (PlaceholderInfo, PromptReplacement,
                                         find_mm_placeholders,
                                         find_text_matches, find_token_matches,
                                         iter_token_matches,
                                         replace_text_matches,
                                         replace_token_matches)
-# yapf: enable
 from vllm.multimodal.profiling import MultiModalProfiler
 from vllm.multimodal.utils import cached_get_tokenizer
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 from vllm.utils import full_groupby
 
+from .utils import random_image
+
 
 # yapf: disable
 @pytest.mark.parametrize(
@@ -531,37 +526,6 @@ def test_find_mm_placeholders(
     assert result == expected
 
 
-def _rand_img(rng: np.random.RandomState, min_wh: int, max_wh: int):
-    w, h = rng.randint(min_wh, max_wh, size=(2, ))
-    arr = rng.randint(0, 255, size=(w, h, 3), dtype=np.uint8)
-    return Image.fromarray(arr)
-
-
-def _rand_video(
-    rng: np.random.RandomState,
-    min_frames: int,
-    max_frames: int,
-    min_wh: int,
-    max_wh: int,
-):
-    # Temporary workaround for https://github.com/huggingface/transformers/issues/35412
-    num_frames = rng.randint(min_frames, max_frames)
-    num_frames = (num_frames // 2) * 2
-
-    w, h = rng.randint(min_wh, max_wh, size=(2, ))
-    return rng.randint(0, 255, size=(num_frames, w, h, 3), dtype=np.uint8)
-
-
-def _rand_audio(
-    rng: np.random.RandomState,
-    min_len: int,
-    max_len: int,
-    sr: int,
-):
-    audio_len = rng.randint(min_len, max_len)
-    return rng.rand(audio_len), sr
-
-
 @pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
 @pytest.mark.parametrize(
     ("limit", "num_supported", "is_valid"),
@@ -628,7 +592,7 @@ def test_limit_mm_per_prompt_apply(model_id, num_images, limit, is_valid):
     )
 
     rng = np.random.RandomState(0)
-    image = _rand_img(rng, min_wh=128, max_wh=256)
+    image = random_image(rng, min_wh=128, max_wh=256)
     if num_images == 0:
         mm_data = {}
     elif num_images == 1:
@@ -647,191 +611,3 @@ def test_limit_mm_per_prompt_apply(model_id, num_images, limit, is_valid):
             mm_data=mm_data,
             hf_processor_mm_kwargs={},
         )
-
-
-def _test_processing_correctness(
-    model_id: str,
-    modalities: dict[str, bool],
-    hit_rate: float,
-    num_batches: int,
-    simplify_rate: float,
-):
-    if model_id == "TIGER-Lab/Mantis-8B-siglip-llama3":
-        hf_overrides = {"architectures": ["MantisForConditionalGeneration"]}
-    else:
-        hf_overrides = {}
-
-    limit_mm_per_prompt = {
-        modality: 3 if supports_multi else 1
-        for modality, supports_multi in modalities.items()
-    }
-
-    model_config = ModelConfig(
-        model_id,
-        task="auto",
-        tokenizer=model_id,
-        tokenizer_mode="auto",
-        trust_remote_code=True,
-        seed=0,
-        dtype="float16",
-        revision=None,
-        hf_overrides=hf_overrides,
-        limit_mm_per_prompt=limit_mm_per_prompt,
-    )
-
-    model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config)
-    factories = MULTIMODAL_REGISTRY._processor_factories[model_cls]
-    ctx = InputProcessingContext(
-        model_config,
-        tokenizer=cached_get_tokenizer(model_config.tokenizer),
-    )
-    # Ensure that it can fit all of the data
-    cache = ProcessingCache(capacity=1 << 30)
-
-    baseline_processor = factories.build_processor(ctx, cache=None)
-    cached_processor = factories.build_processor(ctx, cache=cache)
-    dummy_inputs = baseline_processor.dummy_inputs
-    tokenizer = baseline_processor.info.get_tokenizer()
-
-    rng = np.random.RandomState(0)
-
-    input_to_hit = {
-        "image": Image.new("RGB", size=(128, 128)),
-        "video": np.zeros((4, 128, 128, 3), dtype=np.uint8),
-        "audio": (np.zeros((512, )), 16000),
-    }
-    input_factory = {
-        "image":
-        partial(_rand_img, rng, min_wh=128, max_wh=256),
-        "video":
-        partial(_rand_video,
-                rng,
-                min_frames=2,
-                max_frames=8,
-                min_wh=128,
-                max_wh=256),
-        "audio":
-        partial(_rand_audio, rng, min_len=512, max_len=1024, sr=16000),
-    }
-
-    for batch_idx in range(num_batches):
-        mm_data = {
-            k:
-            [(input_to_hit[k] if rng.rand() < hit_rate else input_factory[k]())
-             for _ in range(rng.randint(limit_mm_per_prompt[k]))]
-            for k in modalities
-        }
-
-        mm_counts = {k: len(vs) for k, vs in mm_data.items()}
-        prompt = dummy_inputs.get_dummy_processor_inputs(
-            model_config.max_model_len,
-            mm_counts,
-        ).prompt_text
-
-        # Drop unnecessary keys and test single -> multi conversion
-        if rng.rand() < simplify_rate:
-            for k in list(mm_data.keys()):
-                if not mm_data[k]:
-                    del mm_data[k]
-                elif len(mm_data[k]) == 1:
-                    mm_data[k] = mm_data[k][0]
-
-        baseline_result = baseline_processor.apply(
-            prompt,
-            mm_data=mm_data,
-            hf_processor_mm_kwargs={},
-        )
-        cached_result = cached_processor.apply(
-            prompt,
-            mm_data=mm_data,
-            hf_processor_mm_kwargs={},
-        )
-
-        assert baseline_result == cached_result, (
-            f"Failed ({batch_idx=}, {prompt=}, {mm_data=})")
-
-        baseline_tokenized_result = baseline_processor.apply(
-            tokenizer.encode(prompt),
-            mm_data=mm_data,
-            hf_processor_mm_kwargs={},
-        )
-
-        assert baseline_result == baseline_tokenized_result, (
-            f"Failed ({batch_idx=}, {prompt=}, {mm_data=})")
-
-        cached_tokenized_result = cached_processor.apply(
-            tokenizer.encode(prompt),
-            mm_data=mm_data,
-            hf_processor_mm_kwargs={},
-        )
-
-        assert cached_result == cached_tokenized_result, (
-            f"Failed ({batch_idx=}, {prompt=}, {mm_data=})")
-
-
-# yapf: disable
-# True if the model supports multiple data items of the modality per request
-@pytest.mark.parametrize(("model_id", "modalities"), [
-    ("rhymes-ai/Aria", {"image": True}),
-    ("Salesforce/blip2-opt-2.7b", {"image": False}),
-    ("facebook/chameleon-7b", {"image": False}),
-    ("adept/fuyu-8b", {"image": False}),
-    ("llava-hf/llava-1.5-7b-hf", {"image": True}),
-    ("llava-hf/llava-v1.6-mistral-7b-hf", {"image": True}),
-    ("llava-hf/LLaVA-NeXT-Video-7B-hf", {"video": False}),
-    ("llava-hf/llava-onevision-qwen2-0.5b-ov-hf", {"image": True, "video": True}),  # noqa: E501
-    ("TIGER-Lab/Mantis-8B-siglip-llama3", {"image": True}),
-    ("mistral-community/pixtral-12b", {"image": True}),
-    ("Qwen/Qwen2-VL-2B-Instruct", {"image": True, "video": True}),
-    ("Qwen/Qwen2-Audio-7B-Instruct", {"audio": True}),
-    ("fixie-ai/ultravox-v0_3", {"audio": True}),
-])
-@pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0])
-@pytest.mark.parametrize("num_batches", [32])
-@pytest.mark.parametrize("simplify_rate", [1.0])
-# yapf: enable
-def test_processing_correctness(
-    model_id: str,
-    modalities: dict[str, bool],
-    hit_rate: float,
-    num_batches: int,
-    simplify_rate: float,
-):
-    _test_processing_correctness(
-        model_id,
-        modalities,
-        hit_rate=hit_rate,
-        num_batches=num_batches,
-        simplify_rate=simplify_rate,
-    )
-
-
-# yapf: disable
-@pytest.mark.parametrize(("model_id", "modalities"), [
-    ("microsoft/Phi-3-vision-128k-instruct", {"image": True}),
-])
-@pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0])
-@pytest.mark.parametrize("num_batches", [32])
-@pytest.mark.parametrize("simplify_rate", [1.0])
-# yapf: enable
-def test_processing_correctness_phi3v(
-    model_id: str,
-    modalities: dict[str, bool],
-    hit_rate: float,
-    num_batches: int,
-    simplify_rate: float,
-):
-    # HACK - this is an attempted workaround for the following bug
-    # https://github.com/huggingface/transformers/issues/34307
-    from transformers import AutoImageProcessor  # noqa: F401
-    from transformers import AutoProcessor  # noqa: F401
-
-    AutoImageProcessor.from_pretrained(model_id, trust_remote_code=True)
-
-    _test_processing_correctness(
-        model_id,
-        modalities,
-        hit_rate=hit_rate,
-        num_batches=num_batches,
-        simplify_rate=simplify_rate,
-    )
diff --git a/tests/multimodal/utils.py b/tests/multimodal/utils.py
new file mode 100644
index 0000000000000..29aeca605109b
--- /dev/null
+++ b/tests/multimodal/utils.py
@@ -0,0 +1,33 @@
+import numpy as np
+from PIL import Image
+
+
+def random_image(rng: np.random.RandomState, min_wh: int, max_wh: int):
+    w, h = rng.randint(min_wh, max_wh, size=(2, ))
+    arr = rng.randint(0, 255, size=(w, h, 3), dtype=np.uint8)
+    return Image.fromarray(arr)
+
+
+def random_video(
+    rng: np.random.RandomState,
+    min_frames: int,
+    max_frames: int,
+    min_wh: int,
+    max_wh: int,
+):
+    # Temporary workaround for https://github.com/huggingface/transformers/issues/35412
+    num_frames = rng.randint(min_frames, max_frames)
+    num_frames = (num_frames // 2) * 2
+
+    w, h = rng.randint(min_wh, max_wh, size=(2, ))
+    return rng.randint(0, 255, size=(num_frames, w, h, 3), dtype=np.uint8)
+
+
+def random_audio(
+    rng: np.random.RandomState,
+    min_len: int,
+    max_len: int,
+    sr: int,
+):
+    audio_len = rng.randint(min_len, max_len)
+    return rng.rand(audio_len), sr

From a991f7d5085e3e9474531f78639766eac3af607c Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Sat, 11 Jan 2025 21:27:24 +0800
Subject: [PATCH 087/115] [Doc] Basic guide for writing unit tests for new
 models (#11951)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 docs/source/contributing/model/basic.md       |  2 +-
 docs/source/contributing/model/index.md       |  1 +
 .../source/contributing/model/registration.md |  3 +-
 docs/source/contributing/model/tests.md       | 63 +++++++++++++++++++
 tests/models/registry.py                      |  5 ++
 tests/models/test_initialization.py           | 10 +++
 6 files changed, 81 insertions(+), 3 deletions(-)
 create mode 100644 docs/source/contributing/model/tests.md

diff --git a/docs/source/contributing/model/basic.md b/docs/source/contributing/model/basic.md
index 002808ac5fbbd..5c2dc486c8bea 100644
--- a/docs/source/contributing/model/basic.md
+++ b/docs/source/contributing/model/basic.md
@@ -1,6 +1,6 @@
 (new-model-basic)=
 
-# Basic Implementation
+# Implementing a Basic Model
 
 This guide walks you through the steps to implement a basic vLLM model.
 
diff --git a/docs/source/contributing/model/index.md b/docs/source/contributing/model/index.md
index 245e13b795ec4..fe018b61b08cf 100644
--- a/docs/source/contributing/model/index.md
+++ b/docs/source/contributing/model/index.md
@@ -10,6 +10,7 @@ This section provides more information on how to integrate a [PyTorch](https://p
 
 basic
 registration
+tests
 multimodal
 ```
 
diff --git a/docs/source/contributing/model/registration.md b/docs/source/contributing/model/registration.md
index 6a9262669cd29..d6c9e4181dfee 100644
--- a/docs/source/contributing/model/registration.md
+++ b/docs/source/contributing/model/registration.md
@@ -1,6 +1,6 @@
 (new-model-registration)=
 
-# Model Registration
+# Registering a Model to vLLM
 
 vLLM relies on a model registry to determine how to run each model.
 A list of pre-registered architectures can be found [here](#supported-models).
@@ -15,7 +15,6 @@ This gives you the ability to modify the codebase and test your model.
 
 After you have implemented your model (see [tutorial](#new-model-basic)), put it into the <gh-dir:vllm/model_executor/models> directory.
 Then, add your model class to `_VLLM_MODELS` in <gh-file:vllm/model_executor/models/registry.py> so that it is automatically registered upon importing vLLM.
-You should also include an example HuggingFace repository for this model in <gh-file:tests/models/registry.py> to run the unit tests.
 Finally, update our [list of supported models](#supported-models) to promote your model!
 
 ```{important}
diff --git a/docs/source/contributing/model/tests.md b/docs/source/contributing/model/tests.md
new file mode 100644
index 0000000000000..74c933b2f45da
--- /dev/null
+++ b/docs/source/contributing/model/tests.md
@@ -0,0 +1,63 @@
+(new-model-tests)=
+
+# Writing Unit Tests
+
+This page explains how to write unit tests to verify the implementation of your model.
+
+## Required Tests
+
+These tests are necessary to get your PR merged into the vLLM library.
+Without them, the CI for your PR will fail.
+
+### Model loading
+
+Include an example HuggingFace repository for your model in <gh-file:tests/models/registry.py>.
+This enables a unit test that loads dummy weights to ensure that the model can be initialized in vLLM.
+
+```{important}
+The list of models in each section should be maintained in alphabetical order.
+```
+
+```{tip}
+If your model requires a development version of HF Transformers, you can set
+`min_transformers_version` to skip the test in CI until the model is released.
+```
+
+## Optional Tests
+
+These tests are not required to get your PR merged into the vLLM library.
+Passing these tests provides more confidence that your implementation is correct, and helps avoid future regressions.
+
+### Model correctness
+
+These tests compare the model outputs of vLLM against [HF Transformers](https://github.com/huggingface/transformers). You can add new tests under the subdirectories of <gh-dir:tests/models>.
+
+#### Generative models
+
+For [generative models](#generative-models), there are two levels of correctness tests, as defined in <gh-file:tests/models/utils.py>:
+
+- Exact correctness (`check_outputs_equal`): The text outputted by vLLM should exactly match the text outputted by HF.
+- Logprobs similarity (`check_logprobs_close`): The logprobs outputted by vLLM should be in the top-k logprobs outputted by HF, and vice versa.
+
+#### Pooling models
+
+For [pooling models](#pooling-models), we simply check the cosine similarity, as defined in <gh-file:tests/models/embedding/utils.py>.
+
+(mm-processing-tests)=
+
+### Multi-modal processing
+
+#### Common tests
+
+Adding your model to <gh-file:tests/models/multimodal/processing/test_common.py> verifies that the following input combinations result in the same outputs:
+
+- Text + multi-modal data
+- Tokens + multi-modal data
+- Text + cached multi-modal data
+- Tokens + cached multi-modal data
+
+#### Model-specific tests
+
+You can add a new file under <gh-dir:tests/models/multimodal/processing> to run tests that only apply to your model.
+
+For example, if the HF processor for your model accepts user-specified keyword arguments, you can verify that the keyword arguments are being applied correctly, such as in <gh-file:tests/models/multimodal/processing/test_phi3v.py>.
diff --git a/tests/models/registry.py b/tests/models/registry.py
index dcb8bfa0f9510..f5aaa8eb071f9 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -22,6 +22,11 @@ class _HfExamplesInfo:
     for speculative decoding.
     """
 
+    min_transformers_version: Optional[str] = None
+    """
+    The minimum version of HF Transformers that is required to run this model.
+    """
+
     is_available_online: bool = True
     """
     Set this to ``False`` if the name of this architecture no longer exists on
diff --git a/tests/models/test_initialization.py b/tests/models/test_initialization.py
index 3b728f2744fca..7a564c1f4a1d0 100644
--- a/tests/models/test_initialization.py
+++ b/tests/models/test_initialization.py
@@ -1,7 +1,9 @@
 from unittest.mock import patch
 
 import pytest
+from packaging.version import Version
 from transformers import PretrainedConfig
+from transformers import __version__ as TRANSFORMERS_VERSION
 
 from vllm import LLM
 
@@ -13,6 +15,14 @@ def test_can_initialize(model_arch):
     model_info = HF_EXAMPLE_MODELS.get_hf_info(model_arch)
     if not model_info.is_available_online:
         pytest.skip("Model is not available online")
+    if model_info.min_transformers_version is not None:
+        current_version = TRANSFORMERS_VERSION
+        required_version = model_info.min_transformers_version
+        if Version(current_version) < Version(required_version):
+            pytest.skip(
+                f"You have `transformers=={current_version}` installed, but "
+                f"`transformers>={required_version}` is required to run this "
+                "model")
 
     # Avoid OOM
     def hf_overrides(hf_config: PretrainedConfig) -> PretrainedConfig:

From d697dc01b4a25b96c3a1e88d72a058f17a717fd5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= <nlucches@redhat.com>
Date: Sat, 11 Jan 2025 15:05:09 +0100
Subject: [PATCH 088/115] [Bugfix] Fix RobertaModel loading (#11940)

Signed-off-by: NickLucche <nlucches@redhat.com>
---
 .../test_model_load_with_params.py            | 27 +++++++++-
 .../embedding/language/test_embedding.py      |  1 +
 vllm/model_executor/models/roberta.py         | 51 +++++++++++++++----
 3 files changed, 67 insertions(+), 12 deletions(-)

diff --git a/tests/model_executor/test_model_load_with_params.py b/tests/model_executor/test_model_load_with_params.py
index ed321ba9f00c1..0609fd96825e3 100644
--- a/tests/model_executor/test_model_load_with_params.py
+++ b/tests/model_executor/test_model_load_with_params.py
@@ -2,7 +2,7 @@
 
 import pytest
 
-from vllm.model_executor.layers.pooler import PoolingType
+from vllm.model_executor.layers.pooler import CLSPool, PoolingType
 from vllm.model_executor.models.bert import BertEmbeddingModel
 from vllm.model_executor.models.roberta import RobertaEmbeddingModel
 from vllm.platforms import current_platform
@@ -92,3 +92,28 @@ def test_roberta_model_loading_with_params(vllm_runner):
 
         # assert output
         assert output
+
+
+@pytest.mark.skipif(current_platform.is_rocm(),
+                    reason="Xformers backend is not supported on ROCm.")
+def test_facebook_roberta_model_loading_with_params(vllm_runner):
+    """
+    Test loading roberta-base model with no lm_head.
+    """
+    model_name = "FacebookAI/roberta-base"
+    with vllm_runner(model_name=model_name,
+                     dtype="float16",
+                     max_model_len=MAX_MODEL_LEN) as model:
+        output = model.encode("Write a short story about a robot that"
+                              " dreams for the first time.\n")
+
+        model_tokenizer = model.model.llm_engine.tokenizer
+        assert model_tokenizer.tokenizer_id == model_name
+
+        model = model.model.llm_engine.model_executor\
+                     .driver_worker.model_runner.model
+        assert not hasattr(model, "lm_head")
+        assert isinstance(model, RobertaEmbeddingModel)
+        assert isinstance(model._pooler, CLSPool)
+
+        assert output
diff --git a/tests/models/embedding/language/test_embedding.py b/tests/models/embedding/language/test_embedding.py
index 7749806548cd9..04ab4dd7371a3 100644
--- a/tests/models/embedding/language/test_embedding.py
+++ b/tests/models/embedding/language/test_embedding.py
@@ -25,6 +25,7 @@
         pytest.param("ssmits/Qwen2-7B-Instruct-embed-base"),
         pytest.param("Alibaba-NLP/gte-Qwen2-1.5B-instruct"),
         pytest.param("Alibaba-NLP/gte-Qwen2-7B-instruct"),
+        pytest.param("sentence-transformers/stsb-roberta-base-v2"),
     ],
 )
 @pytest.mark.parametrize("dtype", ["half"])
diff --git a/vllm/model_executor/models/roberta.py b/vllm/model_executor/models/roberta.py
index ba1a78ac640fd..5997a76890c9d 100644
--- a/vllm/model_executor/models/roberta.py
+++ b/vllm/model_executor/models/roberta.py
@@ -1,3 +1,4 @@
+import itertools
 from typing import Iterable, List, Optional, Tuple
 
 import torch
@@ -20,6 +21,30 @@
 from .interfaces import SupportsCrossEncoding
 
 
+def roberta_task_weights_filter(
+    all_weights: Iterable[Tuple[str, torch.Tensor]]
+) -> Tuple[Iterable[Tuple[str, torch.Tensor]], Iterable[Tuple[str,
+                                                              torch.Tensor]]]:
+    """
+    Separate task-specific weights that are applied on top
+    of the encoder-decoder bert base.
+    To do so, return two generators over the original iterator.
+    Also, remove the "roberta." prefix to make it loadable
+    from vanilla BertModel.
+    """
+    # Duplicate the lazy iterator so both views can be consumed
+    # independently; tee buffers items one view has not yet read.
+    all_weights1, all_weights2 = itertools.tee(all_weights)
+
+    def encoder_decoder_weights():
+        for name, weight in all_weights1:
+            if name.startswith("roberta."):
+                yield (name[len("roberta."):], weight)
+
+    return encoder_decoder_weights(), ((n, w) for n, w in all_weights2
+                                       if not n.startswith("roberta."))
+
+
 class RobertaEmbedding(nn.Module):
 
     def __init__(self, config: RobertaConfig):
@@ -152,6 +177,18 @@ def _build_model(self,
                          prefix=prefix,
                          embedding_class=RobertaEmbedding)
 
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        weights = self.hf_to_vllm_mapper.apply(weights)
+        # Separate weights in "roberta"-prefixed and all else (not in memory).
+        # For use with models like FacebookAI/roberta-base.
+        bert_weights, task_weights = roberta_task_weights_filter(weights)
+        loaded = self.model.load_weights(bert_weights)
+        if not len(loaded):
+            # Fix for models like `sentence-transformers/stsb-roberta-base-v2`
+            # which use the same architecture, but have no "roberta" prefix.
+            loaded = self.model.load_weights(task_weights)
+        assert len(loaded), "Unable to load RobertaEmbeddingModel"
+
 
 class RobertaForSequenceClassification(nn.Module, SupportsCrossEncoding):
     """A model that uses Roberta to provide embedding functionalities.
@@ -181,20 +218,12 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
 
     def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
 
-        self_weights = []
-
-        def weight_filter():
-            for name, weight in weights:
-                if name.startswith("roberta."):
-                    yield (name[len("roberta."):], weight)
-                else:
-                    self_weights.append((name, weight))
-
-        self.roberta.load_weights(weight_filter())
+        bert_weights, task_weights = roberta_task_weights_filter(weights)
+        self.roberta.load_weights(bert_weights)
 
         params_dict = dict(self.named_parameters())
 
-        for name, loaded_weight in self_weights:
+        for name, loaded_weight in task_weights:
             if name.startswith("classifier"):
                 param = params_dict[name]
                 weight_loader = getattr(param, "weight_loader",

From 4b657d32922cb6d3179f901e312715279ad9c728 Mon Sep 17 00:00:00 2001
From: sixgod <evethwillbeok@outlook.com>
Date: Sun, 12 Jan 2025 03:05:56 +0800
Subject: [PATCH 089/115] [Model] Add cogagent model support vLLM (#11742)

Signed-off-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Isotr0py <2037008807@qq.com>
---
 vllm/model_executor/models/chatglm.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py
index ffd6891b25965..7e37ce3086e6b 100644
--- a/vllm/model_executor/models/chatglm.py
+++ b/vllm/model_executor/models/chatglm.py
@@ -1,6 +1,6 @@
 # Adapted from
-# https://github.com/THUDM/GLM-4
-"""Inference-only ChatGLM model compatible with THUDM weights."""
+# https://github.com/THUDM/CogAgent
+"""Inference-only CogAgent model compatible with THUDM weights."""
 from argparse import Namespace
 from array import array
 from typing import (Dict, Iterable, List, Mapping, Optional, Set, Tuple,
@@ -201,7 +201,6 @@ def input_processor_for_glmv(ctx: InputContext, inputs: DecoderOnlyInputs):
 
     new_input_ids = []
     final_processed_position = 0
-    final_processed_position = 0
 
     for boi_position, eoi_position in zip(boi_positions, eoi_positions):
         assert boi_position < eoi_position
@@ -275,12 +274,15 @@ def __init__(
         # https://huggingface.co/THUDM/chatglm3-6b-32k/blob/e210410255278dd9d74463cf396ba559c0ef801c/modeling_chatglm.py#L141
         rope_ratio = getattr(config, "rope_ratio", 1.0)
         max_positions = getattr(config, "seq_length", 8192)
+        # NOTE: THUDM/cogagent-9b-20241220 uses original_rope=False,
+        # which is equivalent to is_neox_style=True
+        is_neox_style = not config.original_rope
         self.rotary_emb = get_rope(
             self.head_dim,
             rotary_dim=self.head_dim // 2,
             max_position=max_positions,
             base=10000 * rope_ratio,
-            is_neox_style=False,
+            is_neox_style=is_neox_style,
         )
         self.attn = Attention(self.num_heads,
                               self.head_dim,
@@ -779,4 +781,4 @@ def __new__(
             return ChatGLMV(vllm_config=vllm_config, prefix=prefix)
         # Initialize LLM
         else:
-            return ChatGLM(vllm_config=vllm_config, prefix=prefix)
+            return ChatGLM(vllm_config=vllm_config, prefix=prefix)
\ No newline at end of file

From b25cfab9a03b5c460fb92340b310d2a5c2dbc5da Mon Sep 17 00:00:00 2001
From: Roger Wang <136131678+ywang96@users.noreply.github.com>
Date: Sat, 11 Jan 2025 22:36:38 -0800
Subject: [PATCH 090/115] [V1] Avoid sending text prompt to core engine
 (#11963)

Signed-off-by: Roger Wang <ywang@roblox.com>
---
 vllm/v1/engine/__init__.py    | 4 ++--
 vllm/v1/engine/core_client.py | 6 ++++++
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py
index 5e3c5e327ef63..3ce9db0e47eed 100644
--- a/vllm/v1/engine/__init__.py
+++ b/vllm/v1/engine/__init__.py
@@ -19,8 +19,8 @@ class EngineCoreRequest:
     # due to circular imports and typing we have in data.py
 
     request_id: str
-    #NOTE(Nick): I don't think we need to pass prompt here since it should
-    # always be tokenized?
+    # NOTE(ywang96): original text prompt is needed when a request is added to
+    # Detokenizer, but set to None when it is added to EngineCoreClient.
     prompt: Optional[str]
     prompt_token_ids: List[int]
     mm_inputs: Optional[List[Optional["MultiModalKwargs"]]]
diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py
index a4a45ae05ff9e..4ed7f944b058f 100644
--- a/vllm/v1/engine/core_client.py
+++ b/vllm/v1/engine/core_client.py
@@ -219,6 +219,9 @@ def _send_input(self, request_type: EngineCoreRequestType,
         self.input_socket.send_multipart(msg, copy=False)
 
     def add_request(self, request: EngineCoreRequest) -> None:
+        # NOTE: text prompt is not needed in the core engine as it has been
+        # tokenized.
+        request.prompt = None
         self._send_input(EngineCoreRequestType.ADD, request)
 
     def abort_requests(self, request_ids: List[str]) -> None:
@@ -257,6 +260,9 @@ async def _send_input(self, request_type: EngineCoreRequestType,
         await self.input_socket.send_multipart(msg, copy=False)
 
     async def add_request_async(self, request: EngineCoreRequest) -> None:
+        # NOTE: text prompt is not needed in the core engine as it has been
+        # tokenized.
+        request.prompt = None
         await self._send_input(EngineCoreRequestType.ADD, request)
 
     async def abort_requests_async(self, request_ids: List[str]) -> None:

From 43f3d9e6990811461ecb42bc50a17aad944d30f9 Mon Sep 17 00:00:00 2001
From: Rafael Vasquez <rafvasq21@gmail.com>
Date: Sun, 12 Jan 2025 03:17:13 -0500
Subject: [PATCH 091/115] [CI/Build] Add markdown linter (#11857)

Signed-off-by: Rafael Vasquez <rafvasq21@gmail.com>
---
 .../{sphinx-lint.yml => doc-lint.yml}         |   4 +-
 docs/README.md                                |   1 +
 docs/source/api/model/index.md                |   1 -
 docs/source/community/sponsors.md             |   2 +
 docs/source/contributing/model/multimodal.md  |   4 +
 docs/source/contributing/overview.md          |   2 -
 docs/source/deployment/docker.md              |   4 +-
 .../source/deployment/frameworks/cerebrium.md |  10 +-
 docs/source/deployment/frameworks/dstack.md   |  10 +-
 docs/source/deployment/frameworks/skypilot.md |  12 +-
 .../deployment/integrations/llamastack.md     |   2 +-
 docs/source/deployment/k8s.md                 | 451 +++++++++---------
 .../source/design/automatic_prefix_caching.md |  11 +-
 docs/source/features/quantization/auto_awq.md |   4 +-
 docs/source/features/quantization/bnb.md      |   7 +-
 docs/source/features/quantization/fp8.md      |   4 +-
 .../features/quantization/fp8_e4m3_kvcache.md |   2 +-
 docs/source/features/quantization/gguf.md     |  10 +-
 docs/source/features/quantization/int8.md     |   2 +-
 docs/source/features/spec_decode.md           |  10 +-
 docs/source/features/tool_calling.md          |  44 +-
 docs/source/getting_started/faq.md            |   2 +-
 .../getting_started/installation/cpu-apple.md |  17 +-
 .../getting_started/installation/cpu-x86.md   |  44 +-
 .../getting_started/installation/gpu-cuda.md  |  96 ++--
 .../getting_started/installation/gpu-rocm.md  | 116 ++---
 .../getting_started/installation/hpu-gaudi.md |  64 +--
 .../getting_started/installation/neuron.md    |   8 +-
 .../getting_started/installation/openvino.md  |  14 +-
 .../getting_started/installation/tpu.md       |   6 +-
 .../getting_started/installation/xpu.md       |  24 +-
 docs/source/getting_started/quickstart.md     |  50 +-
 .../source/getting_started/troubleshooting.md |   6 +-
 docs/source/index.md                          |   4 +-
 .../models/extensions/runai_model_streamer.md |  12 +-
 docs/source/models/supported_models.md        |  13 +-
 docs/source/performance/optimization.md       |   2 +-
 docs/source/serving/distributed_serving.md    |  40 +-
 docs/source/serving/integrations/langchain.md |   2 +-
 .../source/serving/integrations/llamaindex.md |   2 +-
 docs/source/serving/metrics.md                |   2 +-
 docs/source/serving/multimodal_inputs.md      |   1 +
 docs/source/serving/offline_inference.md      |   2 +-
 .../serving/openai_compatible_server.md       |  19 +-
 format.sh                                     |  10 +-
 pyproject.toml                                |   6 +
 requirements-lint.txt                         |   2 +-
 tools/doc-lint.sh                             |   3 +
 tools/sphinx-lint.sh                          |   3 -
 49 files changed, 596 insertions(+), 571 deletions(-)
 rename .github/workflows/{sphinx-lint.yml => doc-lint.yml} (93%)
 create mode 100755 tools/doc-lint.sh
 delete mode 100755 tools/sphinx-lint.sh

diff --git a/.github/workflows/sphinx-lint.yml b/.github/workflows/doc-lint.yml
similarity index 93%
rename from .github/workflows/sphinx-lint.yml
rename to .github/workflows/doc-lint.yml
index e0bb24276a653..2f5ee8bbfd8c5 100644
--- a/.github/workflows/sphinx-lint.yml
+++ b/.github/workflows/doc-lint.yml
@@ -13,7 +13,7 @@ on:
       - "docs/**"
 
 jobs:
-  sphinx-lint:
+  doc-lint:
     runs-on: ubuntu-latest
     strategy:
       matrix:
@@ -29,4 +29,4 @@ jobs:
           python -m pip install --upgrade pip
           pip install -r requirements-lint.txt
       - name: Linting docs
-        run: tools/sphinx-lint.sh
+        run: tools/doc-lint.sh
diff --git a/docs/README.md b/docs/README.md
index 46488c9bb0b92..1a44c1341f4fb 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -16,4 +16,5 @@ make html
 ```bash
 python -m http.server -d build/html/
 ```
+
 Launch your browser and open localhost:8000.
diff --git a/docs/source/api/model/index.md b/docs/source/api/model/index.md
index b8437e3c3517a..113792147be7c 100644
--- a/docs/source/api/model/index.md
+++ b/docs/source/api/model/index.md
@@ -9,4 +9,3 @@ interfaces_base
 interfaces
 adapters
 ```
-
diff --git a/docs/source/community/sponsors.md b/docs/source/community/sponsors.md
index 9d2af4c13b088..fb93e65673dff 100644
--- a/docs/source/community/sponsors.md
+++ b/docs/source/community/sponsors.md
@@ -6,6 +6,7 @@ vLLM is a community project. Our compute resources for development and testing a
 <!-- Note: Please keep these consistent with README.md. -->
 
 Cash Donations:
+
 - a16z
 - Dropbox
 - Sequoia Capital
@@ -13,6 +14,7 @@ Cash Donations:
 - ZhenFund
 
 Compute Resources:
+
 - AMD
 - Anyscale
 - AWS
diff --git a/docs/source/contributing/model/multimodal.md b/docs/source/contributing/model/multimodal.md
index 76ab73e43d24b..99f6a1d5462c4 100644
--- a/docs/source/contributing/model/multimodal.md
+++ b/docs/source/contributing/model/multimodal.md
@@ -200,6 +200,7 @@ def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
 ```{note}
 Our [actual code](gh-file:vllm/model_executor/models/llava.py) is more abstracted to support vision encoders other than CLIP.
 ```
+
 :::
 ::::
 
@@ -248,6 +249,7 @@ def get_dummy_processor_inputs(
         mm_data=mm_data,
     )
 ```
+
 :::
 ::::
 
@@ -312,6 +314,7 @@ def _get_mm_fields_config(
 Our [actual code](gh-file:vllm/model_executor/models/llava.py) additionally supports
 pre-computed image embeddings, which can be passed to be model via the `image_embeds` argument.
 ```
+
 :::
 ::::
 
@@ -369,6 +372,7 @@ def _get_prompt_replacements(
         ),
     ]
 ```
+
 :::
 ::::
 
diff --git a/docs/source/contributing/overview.md b/docs/source/contributing/overview.md
index c960790f47a13..e92104399342d 100644
--- a/docs/source/contributing/overview.md
+++ b/docs/source/contributing/overview.md
@@ -37,8 +37,6 @@ pytest tests/
 Currently, the repository is not fully checked by `mypy`.
 ```
 
-# Contribution Guidelines
-
 ## Issues
 
 If you encounter a bug or have a feature request, please [search existing issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue) first to see if it has already been reported. If not, please [file a new issue](https://github.com/vllm-project/vllm/issues/new/choose), providing as much relevant information as possible.
diff --git a/docs/source/deployment/docker.md b/docs/source/deployment/docker.md
index 2df1aca27f1e6..c735bfd0e87a7 100644
--- a/docs/source/deployment/docker.md
+++ b/docs/source/deployment/docker.md
@@ -28,8 +28,8 @@ memory to share data between processes under the hood, particularly for tensor p
 You can build and run vLLM from source via the provided <gh-file:Dockerfile>. To build vLLM:
 
 ```console
-$ # optionally specifies: --build-arg max_jobs=8 --build-arg nvcc_threads=2
-$ DOCKER_BUILDKIT=1 docker build . --target vllm-openai --tag vllm/vllm-openai
+# optionally specifies: --build-arg max_jobs=8 --build-arg nvcc_threads=2
+DOCKER_BUILDKIT=1 docker build . --target vllm-openai --tag vllm/vllm-openai
 ```
 
 ```{note}
diff --git a/docs/source/deployment/frameworks/cerebrium.md b/docs/source/deployment/frameworks/cerebrium.md
index be018dfb75d7a..5787c4a407bfb 100644
--- a/docs/source/deployment/frameworks/cerebrium.md
+++ b/docs/source/deployment/frameworks/cerebrium.md
@@ -13,14 +13,14 @@ vLLM can be run on a cloud based GPU machine with [Cerebrium](https://www.cerebr
 To install the Cerebrium client, run:
 
 ```console
-$ pip install cerebrium
-$ cerebrium login
+pip install cerebrium
+cerebrium login
 ```
 
 Next, create your Cerebrium project, run:
 
 ```console
-$ cerebrium init vllm-project
+cerebrium init vllm-project
 ```
 
 Next, to install the required packages, add the following to your cerebrium.toml:
@@ -58,10 +58,10 @@ def run(prompts: list[str], temperature: float = 0.8, top_p: float = 0.95):
 Then, run the following code to deploy it to the cloud:
 
 ```console
-$ cerebrium deploy
+cerebrium deploy
 ```
 
-If successful, you should be returned a CURL command that you can call inference against. Just remember to end the url with the function name you are calling (in our case` /run`)
+If successful, you should be returned a CURL command that you can call inference against. Just remember to end the URL with the function name you are calling (in our case `/run`)
 
 ```python
 curl -X POST https://api.cortex.cerebrium.ai/v4/p-xxxxxx/vllm/run \
diff --git a/docs/source/deployment/frameworks/dstack.md b/docs/source/deployment/frameworks/dstack.md
index 4142c1d9f1f60..b42a34125c6d7 100644
--- a/docs/source/deployment/frameworks/dstack.md
+++ b/docs/source/deployment/frameworks/dstack.md
@@ -13,16 +13,16 @@ vLLM can be run on a cloud based GPU machine with [dstack](https://dstack.ai/),
 To install dstack client, run:
 
 ```console
-$ pip install "dstack[all]
-$ dstack server
+pip install "dstack[all]"
+dstack server
 ```
 
 Next, to configure your dstack project, run:
 
 ```console
-$ mkdir -p vllm-dstack
-$ cd vllm-dstack
-$ dstack init
+mkdir -p vllm-dstack
+cd vllm-dstack
+dstack init
 ```
 
 Next, to provision a VM instance with LLM of your choice (`NousResearch/Llama-2-7b-chat-hf` for this example), create the following `serve.dstack.yml` file for the dstack `Service`:
diff --git a/docs/source/deployment/frameworks/skypilot.md b/docs/source/deployment/frameworks/skypilot.md
index 657e7f2bc72cc..051fc2f2a8d4e 100644
--- a/docs/source/deployment/frameworks/skypilot.md
+++ b/docs/source/deployment/frameworks/skypilot.md
@@ -334,12 +334,12 @@ run: |
 
 1. Start the chat web UI:
 
-```console
-sky launch -c gui ./gui.yaml --env ENDPOINT=$(sky serve status --endpoint vllm)
-```
+    ```console
+    sky launch -c gui ./gui.yaml --env ENDPOINT=$(sky serve status --endpoint vllm)
+    ```
 
 2. Then, we can access the GUI at the returned gradio link:
 
-```console
-| INFO | stdout | Running on public URL: https://6141e84201ce0bb4ed.gradio.live
-```
+    ```console
+    | INFO | stdout | Running on public URL: https://6141e84201ce0bb4ed.gradio.live
+    ```
diff --git a/docs/source/deployment/integrations/llamastack.md b/docs/source/deployment/integrations/llamastack.md
index 474d2bdfa9580..a6c3569637abf 100644
--- a/docs/source/deployment/integrations/llamastack.md
+++ b/docs/source/deployment/integrations/llamastack.md
@@ -7,7 +7,7 @@ vLLM is also available via [Llama Stack](https://github.com/meta-llama/llama-sta
 To install Llama Stack, run
 
 ```console
-$ pip install llama-stack -q
+pip install llama-stack -q
 ```
 
 ## Inference using OpenAI Compatible API
diff --git a/docs/source/deployment/k8s.md b/docs/source/deployment/k8s.md
index 760214e112fba..cbc95c20ff4b3 100644
--- a/docs/source/deployment/k8s.md
+++ b/docs/source/deployment/k8s.md
@@ -14,234 +14,235 @@ Before you begin, ensure that you have the following:
 
 ## Deployment Steps
 
-1. **Create a PVC , Secret and Deployment for vLLM**
-
-PVC is used to store the model cache and it is optional, you can use hostPath or other storage options
-
-```yaml
-apiVersion: v1
-kind: PersistentVolumeClaim
-metadata:
-  name: mistral-7b
-  namespace: default
-spec:
-  accessModes:
-  - ReadWriteOnce
-  resources:
-    requests:
-      storage: 50Gi
-  storageClassName: default
-  volumeMode: Filesystem
-```
-
-Secret is optional and only required for accessing gated models, you can skip this step if you are not using gated models
-
-```yaml
-apiVersion: v1
-kind: Secret
-metadata:
-  name: hf-token-secret
-  namespace: default
-type: Opaque
-stringData:
-  token: "REPLACE_WITH_TOKEN"
-```
-
-Next to create the deployment file for vLLM to run the model server. The following example deploys the `Mistral-7B-Instruct-v0.3` model.
-
-Here are two examples for using NVIDIA GPU and AMD GPU. 
-
-- NVIDIA GPU
-
-```yaml
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: mistral-7b
-  namespace: default
-  labels:
-    app: mistral-7b
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: mistral-7b
-  template:
-    metadata:
-      labels:
-        app: mistral-7b
-    spec:
-      volumes:
-      - name: cache-volume
-        persistentVolumeClaim:
-          claimName: mistral-7b
-      # vLLM needs to access the host's shared memory for tensor parallel inference.
-      - name: shm
-        emptyDir:
-          medium: Memory
-          sizeLimit: "2Gi"
-      containers:
-      - name: mistral-7b
-        image: vllm/vllm-openai:latest
-        command: ["/bin/sh", "-c"]
-        args: [
-          "vllm serve mistralai/Mistral-7B-Instruct-v0.3 --trust-remote-code --enable-chunked-prefill --max_num_batched_tokens 1024"
-        ]
-        env:
-        - name: HUGGING_FACE_HUB_TOKEN
-          valueFrom:
-            secretKeyRef:
-              name: hf-token-secret
-              key: token
-        ports:
-        - containerPort: 8000
+1. Create a PVC, Secret and Deployment for vLLM
+
+      PVC is used to store the model cache and it is optional; you can use hostPath or other storage options instead.
+
+      ```yaml
+      apiVersion: v1
+      kind: PersistentVolumeClaim
+      metadata:
+        name: mistral-7b
+        namespace: default
+      spec:
+        accessModes:
+        - ReadWriteOnce
         resources:
-          limits:
-            cpu: "10"
-            memory: 20G
-            nvidia.com/gpu: "1"
           requests:
-            cpu: "2"
-            memory: 6G
-            nvidia.com/gpu: "1"
-        volumeMounts:
-        - mountPath: /root/.cache/huggingface
-          name: cache-volume
-        - name: shm
-          mountPath: /dev/shm
-        livenessProbe:
-          httpGet:
-            path: /health
-            port: 8000
-          initialDelaySeconds: 60
-          periodSeconds: 10
-        readinessProbe:
-          httpGet:
-            path: /health
-            port: 8000
-          initialDelaySeconds: 60
-          periodSeconds: 5
-```
-
-- AMD GPU
-
-You can refer to the `deployment.yaml` below if using AMD ROCm GPU like MI300X.
-
-```yaml
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: mistral-7b
-  namespace: default
-  labels:
-    app: mistral-7b
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: mistral-7b
-  template:
-    metadata:
-      labels:
-        app: mistral-7b
-    spec:
-      volumes:
-      # PVC
-      - name: cache-volume
-        persistentVolumeClaim:
-          claimName: mistral-7b
-      # vLLM needs to access the host's shared memory for tensor parallel inference.
-      - name: shm
-        emptyDir:
-          medium: Memory
-          sizeLimit: "8Gi"
-      hostNetwork: true
-      hostIPC: true
-      containers:
-      - name: mistral-7b
-        image: rocm/vllm:rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4
-        securityContext:
-          seccompProfile:
-            type: Unconfined
-          runAsGroup: 44
-          capabilities:
-            add:
-            - SYS_PTRACE
-        command: ["/bin/sh", "-c"]
-        args: [
-          "vllm serve mistralai/Mistral-7B-v0.3 --port 8000 --trust-remote-code --enable-chunked-prefill --max_num_batched_tokens 1024"
-        ]
-        env:
-        - name: HUGGING_FACE_HUB_TOKEN
-          valueFrom:
-            secretKeyRef:
-              name: hf-token-secret
-              key: token
+            storage: 50Gi
+        storageClassName: default
+        volumeMode: Filesystem
+      ```
+
+      Secret is optional and only required for accessing gated models; you can skip this step if you are not using gated models.
+
+      ```yaml
+      apiVersion: v1
+      kind: Secret
+      metadata:
+        name: hf-token-secret
+        namespace: default
+      type: Opaque
+      stringData:
+        token: "REPLACE_WITH_TOKEN"
+      ```
+
+      Next, create the deployment file for vLLM to run the model server. The following example deploys the `Mistral-7B-Instruct-v0.3` model.
+
+      Here are two examples for using NVIDIA GPU and AMD GPU.
+
+      NVIDIA GPU:
+
+      ```yaml
+      apiVersion: apps/v1
+      kind: Deployment
+      metadata:
+        name: mistral-7b
+        namespace: default
+        labels:
+          app: mistral-7b
+      spec:
+        replicas: 1
+        selector:
+          matchLabels:
+            app: mistral-7b
+        template:
+          metadata:
+            labels:
+              app: mistral-7b
+          spec:
+            volumes:
+            - name: cache-volume
+              persistentVolumeClaim:
+                claimName: mistral-7b
+            # vLLM needs to access the host's shared memory for tensor parallel inference.
+            - name: shm
+              emptyDir:
+                medium: Memory
+                sizeLimit: "2Gi"
+            containers:
+            - name: mistral-7b
+              image: vllm/vllm-openai:latest
+              command: ["/bin/sh", "-c"]
+              args: [
+                "vllm serve mistralai/Mistral-7B-Instruct-v0.3 --trust-remote-code --enable-chunked-prefill --max_num_batched_tokens 1024"
+              ]
+              env:
+              - name: HUGGING_FACE_HUB_TOKEN
+                valueFrom:
+                  secretKeyRef:
+                    name: hf-token-secret
+                    key: token
+              ports:
+              - containerPort: 8000
+              resources:
+                limits:
+                  cpu: "10"
+                  memory: 20G
+                  nvidia.com/gpu: "1"
+                requests:
+                  cpu: "2"
+                  memory: 6G
+                  nvidia.com/gpu: "1"
+              volumeMounts:
+              - mountPath: /root/.cache/huggingface
+                name: cache-volume
+              - name: shm
+                mountPath: /dev/shm
+              livenessProbe:
+                httpGet:
+                  path: /health
+                  port: 8000
+                initialDelaySeconds: 60
+                periodSeconds: 10
+              readinessProbe:
+                httpGet:
+                  path: /health
+                  port: 8000
+                initialDelaySeconds: 60
+                periodSeconds: 5
+      ```
+
+      AMD GPU:
+
+      You can refer to the `deployment.yaml` below if you are using an AMD ROCm GPU such as the MI300X.
+
+      ```yaml
+      apiVersion: apps/v1
+      kind: Deployment
+      metadata:
+        name: mistral-7b
+        namespace: default
+        labels:
+          app: mistral-7b
+      spec:
+        replicas: 1
+        selector:
+          matchLabels:
+            app: mistral-7b
+        template:
+          metadata:
+            labels:
+              app: mistral-7b
+          spec:
+            volumes:
+            # PVC
+            - name: cache-volume
+              persistentVolumeClaim:
+                claimName: mistral-7b
+            # vLLM needs to access the host's shared memory for tensor parallel inference.
+            - name: shm
+              emptyDir:
+                medium: Memory
+                sizeLimit: "8Gi"
+            hostNetwork: true
+            hostIPC: true
+            containers:
+            - name: mistral-7b
+              image: rocm/vllm:rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4
+              securityContext:
+                seccompProfile:
+                  type: Unconfined
+                runAsGroup: 44
+                capabilities:
+                  add:
+                  - SYS_PTRACE
+              command: ["/bin/sh", "-c"]
+              args: [
+                "vllm serve mistralai/Mistral-7B-v0.3 --port 8000 --trust-remote-code --enable-chunked-prefill --max_num_batched_tokens 1024"
+              ]
+              env:
+              - name: HUGGING_FACE_HUB_TOKEN
+                valueFrom:
+                  secretKeyRef:
+                    name: hf-token-secret
+                    key: token
+              ports:
+              - containerPort: 8000
+              resources:
+                limits:
+                  cpu: "10"
+                  memory: 20G
+                  amd.com/gpu: "1"
+                requests:
+                  cpu: "6"
+                  memory: 6G
+                  amd.com/gpu: "1"
+              volumeMounts:
+              - name: cache-volume
+                mountPath: /root/.cache/huggingface
+              - name: shm
+                mountPath: /dev/shm
+      ```
+
+      You can get the full example with steps and sample yaml files from <https://github.com/ROCm/k8s-device-plugin/tree/master/example/vllm-serve>.
+
+2. Create a Kubernetes Service for vLLM
+
+      Next, create a Kubernetes Service file to expose the `mistral-7b` deployment:
+
+      ```yaml
+      apiVersion: v1
+      kind: Service
+      metadata:
+        name: mistral-7b
+        namespace: default
+      spec:
         ports:
-        - containerPort: 8000
-        resources:
-          limits:
-            cpu: "10"
-            memory: 20G
-            amd.com/gpu: "1"
-          requests:
-            cpu: "6"
-            memory: 6G
-            amd.com/gpu: "1"
-        volumeMounts:
-        - name: cache-volume
-          mountPath: /root/.cache/huggingface
-        - name: shm
-          mountPath: /dev/shm
-```
-You can get the full example with steps and sample yaml files from <https://github.com/ROCm/k8s-device-plugin/tree/master/example/vllm-serve>.
-
-2. **Create a Kubernetes Service for vLLM**
-
-Next, create a Kubernetes Service file to expose the `mistral-7b` deployment:
-
-```yaml
-apiVersion: v1
-kind: Service
-metadata:
-  name: mistral-7b
-  namespace: default
-spec:
-  ports:
-  - name: http-mistral-7b
-    port: 80
-    protocol: TCP
-    targetPort: 8000
-  # The label selector should match the deployment labels & it is useful for prefix caching feature
-  selector:
-    app: mistral-7b
-  sessionAffinity: None
-  type: ClusterIP
-```
-
-3. **Deploy and Test**
-
-Apply the deployment and service configurations using `kubectl apply -f <filename>`:
-
-```console
-kubectl apply -f deployment.yaml
-kubectl apply -f service.yaml
-```
-
-To test the deployment, run the following `curl` command:
-
-```console
-curl http://mistral-7b.default.svc.cluster.local/v1/completions \
-  -H "Content-Type: application/json" \
-  -d '{
-        "model": "mistralai/Mistral-7B-Instruct-v0.3",
-        "prompt": "San Francisco is a",
-        "max_tokens": 7,
-        "temperature": 0
-      }'
-```
-
-If the service is correctly deployed, you should receive a response from the vLLM model.
+        - name: http-mistral-7b
+          port: 80
+          protocol: TCP
+          targetPort: 8000
+        # The label selector should match the deployment labels & it is useful for prefix caching feature
+        selector:
+          app: mistral-7b
+        sessionAffinity: None
+        type: ClusterIP
+      ```
+
+3. Deploy and Test
+
+      Apply the deployment and service configurations using `kubectl apply -f <filename>`:
+
+      ```console
+      kubectl apply -f deployment.yaml
+      kubectl apply -f service.yaml
+      ```
+
+      To test the deployment, run the following `curl` command:
+
+      ```console
+      curl http://mistral-7b.default.svc.cluster.local/v1/completions \
+        -H "Content-Type: application/json" \
+        -d '{
+              "model": "mistralai/Mistral-7B-Instruct-v0.3",
+              "prompt": "San Francisco is a",
+              "max_tokens": 7,
+              "temperature": 0
+            }'
+      ```
+
+      If the service is correctly deployed, you should receive a response from the vLLM model.
 
 ## Conclusion
 
diff --git a/docs/source/design/automatic_prefix_caching.md b/docs/source/design/automatic_prefix_caching.md
index 6d3dd056e6a60..3928e0c16568b 100644
--- a/docs/source/design/automatic_prefix_caching.md
+++ b/docs/source/design/automatic_prefix_caching.md
@@ -6,7 +6,7 @@ The core idea of [PagedAttention](https://blog.vllm.ai/2023/06/20/vllm.html) is
 
 To automatically cache the KV cache, we utilize the following key observation: Each KV block can be uniquely identified by the tokens within the block and the tokens in the prefix before the block.
 
-```
+```text
                     Block 1                  Block 2                  Block 3
          [A gentle breeze stirred] [the leaves as children] [laughed in the distance]
 Block 1: |<--- block tokens ---->|
@@ -14,19 +14,16 @@ Block 2: |<------- prefix ------>| |<--- block tokens --->|
 Block 3: |<------------------ prefix -------------------->| |<--- block tokens ---->|
 ```
 
-
 In the example above, the KV cache in the first block can be uniquely identified with the tokens “A gentle breeze stirred”. The third block can be uniquely identified with the tokens in the block “laughed in the distance”, along with the prefix tokens “A gentle breeze stirred the leaves as children”. Therefore, we can build the following one-to-one mapping:
 
-```
+```text
 hash(prefix tokens + block tokens) <--> KV Block
 ```
 
 With this mapping, we can add another indirection in vLLM’s KV cache management. Previously, each sequence in vLLM maintained a mapping from their logical KV blocks to physical blocks. To achieve automatic caching of KV blocks, we map the logical KV blocks to their hash value and maintain a global hash table of all the physical blocks. In this way, all the KV blocks sharing the same hash value (e.g., shared prefix blocks across two requests) can be mapped to the same physical block and share the memory space.
 
-
 This design achieves automatic prefix caching without the need of maintaining a tree structure among the KV blocks. More specifically, all of the blocks are independent of each other and can be allocated and freed by itself, which enables us to manages the KV cache as ordinary caches in operating system.
 
-
 ## Generalized Caching Policy
 
 Keeping all the KV blocks in a hash table enables vLLM to cache KV blocks from earlier requests to save memory and accelerate the computation of future requests. For example, if a new request shares the system prompt with the previous request, the KV cache of the shared prompt can directly be used for the new request without recomputation. However, the total KV cache space is limited and we have to decide which KV blocks to keep or evict when the cache is full.
@@ -41,5 +38,5 @@ Note that this eviction policy effectively implements the exact policy as in [Ra
 
 However, the hash-based KV cache management gives us the flexibility to handle more complicated serving scenarios and implement more complicated eviction policies beyond the policy above:
 
-- Multi-LoRA serving. When serving requests for multiple LoRA adapters, we can simply let the hash of each KV block to also include the LoRA ID the request is querying for to enable caching for all adapters. In this way, we can jointly manage the KV blocks for different adapters, which simplifies the system implementation and improves the global cache hit rate and efficiency.
-- Multi-modal models. When the user input includes more than just discrete tokens, we can use different hashing methods to handle the caching of inputs of different modalities. For example, perceptual hashing for images to cache similar input images.
+* Multi-LoRA serving. When serving requests for multiple LoRA adapters, we can simply let the hash of each KV block also include the LoRA ID that the request is querying for, which enables caching for all adapters. In this way, we can jointly manage the KV blocks for different adapters, which simplifies the system implementation and improves the global cache hit rate and efficiency.
+* Multi-modal models. When the user input includes more than just discrete tokens, we can use different hashing methods to handle the caching of inputs of different modalities. For example, perceptual hashing for images to cache similar input images.
diff --git a/docs/source/features/quantization/auto_awq.md b/docs/source/features/quantization/auto_awq.md
index 3679595e3d4d0..404505eb3890e 100644
--- a/docs/source/features/quantization/auto_awq.md
+++ b/docs/source/features/quantization/auto_awq.md
@@ -15,7 +15,7 @@ The main benefits are lower latency and memory usage.
 You can quantize your own models by installing AutoAWQ or picking one of the [400+ models on Huggingface](https://huggingface.co/models?sort=trending&search=awq).
 
 ```console
-$ pip install autoawq
+pip install autoawq
 ```
 
 After installing AutoAWQ, you are ready to quantize a model. Here is an example of how to quantize `mistralai/Mistral-7B-Instruct-v0.2`:
@@ -47,7 +47,7 @@ print(f'Model is quantized and saved at "{quant_path}"')
 To run an AWQ model with vLLM, you can use [TheBloke/Llama-2-7b-Chat-AWQ](https://huggingface.co/TheBloke/Llama-2-7b-Chat-AWQ) with the following command:
 
 ```console
-$ python examples/offline_inference/llm_engine_example.py --model TheBloke/Llama-2-7b-Chat-AWQ --quantization awq
+python examples/offline_inference/llm_engine_example.py --model TheBloke/Llama-2-7b-Chat-AWQ --quantization awq
 ```
 
 AWQ models are also supported directly through the LLM entrypoint:
diff --git a/docs/source/features/quantization/bnb.md b/docs/source/features/quantization/bnb.md
index f7f41726f3725..7525e8e7866c3 100644
--- a/docs/source/features/quantization/bnb.md
+++ b/docs/source/features/quantization/bnb.md
@@ -9,7 +9,7 @@ Compared to other quantization methods, BitsAndBytes eliminates the need for cal
 Below are the steps to utilize BitsAndBytes with vLLM.
 
 ```console
-$ pip install bitsandbytes>=0.45.0
+pip install "bitsandbytes>=0.45.0"
 ```
 
 vLLM reads the model's config file and supports both in-flight quantization and pre-quantized checkpoint.
@@ -17,7 +17,7 @@ vLLM reads the model's config file and supports both in-flight quantization and
 You can find bitsandbytes quantized models on <https://huggingface.co/models?other=bitsandbytes>.
 And usually, these repositories have a config.json file that includes a quantization_config section.
 
-## Read quantized checkpoint.
+## Read quantized checkpoint
 
 ```python
 from vllm import LLM
@@ -37,10 +37,11 @@ model_id = "huggyllama/llama-7b"
 llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True, \
 quantization="bitsandbytes", load_format="bitsandbytes")
 ```
+
 ## OpenAI Compatible Server
 
 Append the following to your 4bit model arguments:
 
-```
+```console
 --quantization bitsandbytes --load-format bitsandbytes
 ```
diff --git a/docs/source/features/quantization/fp8.md b/docs/source/features/quantization/fp8.md
index b2eda74fd1e3b..da49cd2747228 100644
--- a/docs/source/features/quantization/fp8.md
+++ b/docs/source/features/quantization/fp8.md
@@ -41,7 +41,7 @@ Currently, we load the model at original precision before quantizing down to 8-b
 To produce performant FP8 quantized models with vLLM, you'll need to install the [llm-compressor](https://github.com/vllm-project/llm-compressor/) library:
 
 ```console
-$ pip install llmcompressor
+pip install llmcompressor
 ```
 
 ## Quantization Process
@@ -98,7 +98,7 @@ tokenizer.save_pretrained(SAVE_DIR)
 Install `vllm` and `lm-evaluation-harness`:
 
 ```console
-$ pip install vllm lm-eval==0.4.4
+pip install vllm lm-eval==0.4.4
 ```
 
 Load and run the model in `vllm`:
diff --git a/docs/source/features/quantization/fp8_e4m3_kvcache.md b/docs/source/features/quantization/fp8_e4m3_kvcache.md
index 50edaf81fddd3..1cd67cb8fd336 100644
--- a/docs/source/features/quantization/fp8_e4m3_kvcache.md
+++ b/docs/source/features/quantization/fp8_e4m3_kvcache.md
@@ -17,7 +17,7 @@ unquantized model through a quantizer tool (e.g. AMD quantizer or NVIDIA AMMO).
 To install AMMO (AlgorithMic Model Optimization):
 
 ```console
-$ pip install --no-cache-dir --extra-index-url https://pypi.nvidia.com nvidia-ammo
+pip install --no-cache-dir --extra-index-url https://pypi.nvidia.com nvidia-ammo
 ```
 
 Studies have shown that FP8 E4M3 quantization typically only minimally degrades inference accuracy. The most recent silicon
diff --git a/docs/source/features/quantization/gguf.md b/docs/source/features/quantization/gguf.md
index eebf11dfc1b2b..640997cf4bc39 100644
--- a/docs/source/features/quantization/gguf.md
+++ b/docs/source/features/quantization/gguf.md
@@ -13,16 +13,16 @@ Currently, vllm only supports loading single-file GGUF models. If you have a mul
 To run a GGUF model with vLLM, you can download and use the local GGUF model from [TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF](https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF) with the following command:
 
 ```console
-$ wget https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF/resolve/main/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf
-$ # We recommend using the tokenizer from base model to avoid long-time and buggy tokenizer conversion.
-$ vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0
+wget https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF/resolve/main/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf
+# We recommend using the tokenizer from base model to avoid long-time and buggy tokenizer conversion.
+vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0
 ```
 
 You can also add `--tensor-parallel-size 2` to enable tensor parallelism inference with 2 GPUs:
 
 ```console
-$ # We recommend using the tokenizer from base model to avoid long-time and buggy tokenizer conversion.
-$ vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 --tensor-parallel-size 2
+# We recommend using the tokenizer from base model to avoid long-time and buggy tokenizer conversion.
+vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 --tensor-parallel-size 2
 ```
 
 ```{warning}
diff --git a/docs/source/features/quantization/int8.md b/docs/source/features/quantization/int8.md
index 1ac50ba987dda..82a15d76d352f 100644
--- a/docs/source/features/quantization/int8.md
+++ b/docs/source/features/quantization/int8.md
@@ -16,7 +16,7 @@ INT8 computation is supported on NVIDIA GPUs with compute capability > 7.5 (Turi
 To use INT8 quantization with vLLM, you'll need to install the [llm-compressor](https://github.com/vllm-project/llm-compressor/) library:
 
 ```console
-$ pip install llmcompressor
+pip install llmcompressor
 ```
 
 ## Quantization Process
diff --git a/docs/source/features/spec_decode.md b/docs/source/features/spec_decode.md
index 903acadb71426..ab7b2f302bd13 100644
--- a/docs/source/features/spec_decode.md
+++ b/docs/source/features/spec_decode.md
@@ -192,11 +192,11 @@ A few important things to consider when using the EAGLE based draft models:
 
 1. The EAGLE draft models available in the [HF repository for EAGLE models](https://huggingface.co/yuhuili) cannot be
    used directly with vLLM due to differences in the expected layer names and model definition.
-   To use these models with vLLM, use the [following script](https://gist.github.com/abhigoyal1997/1e7a4109ccb7704fbc67f625e86b2d6d) 
+   To use these models with vLLM, use the [following script](https://gist.github.com/abhigoyal1997/1e7a4109ccb7704fbc67f625e86b2d6d)
    to convert them. Note that this script does not modify the model's weights.
 
    In the above example, use the script to first convert
-   the [yuhuili/EAGLE-LLaMA3-Instruct-8B](https://huggingface.co/yuhuili/EAGLE-LLaMA3-Instruct-8B) model 
+   the [yuhuili/EAGLE-LLaMA3-Instruct-8B](https://huggingface.co/yuhuili/EAGLE-LLaMA3-Instruct-8B) model
    and then use the converted checkpoint as the draft model in vLLM.
 
 2. The EAGLE based draft models need to be run without tensor parallelism
@@ -207,7 +207,6 @@ A few important things to consider when using the EAGLE based draft models:
    reported in the reference implementation [here](https://github.com/SafeAILab/EAGLE). This issue is under
    investigation and tracked here: [https://github.com/vllm-project/vllm/issues/9565](https://github.com/vllm-project/vllm/issues/9565).
 
-
 A variety of EAGLE draft models are available on the Hugging Face hub:
 
 | Base Model                                                           | EAGLE on Hugging Face                     | # EAGLE Parameters |
@@ -224,7 +223,6 @@ A variety of EAGLE draft models are available on the Hugging Face hub:
 | Qwen2-7B-Instruct                                                    | yuhuili/EAGLE-Qwen2-7B-Instruct          | 0.26B              |
 | Qwen2-72B-Instruct                                                   | yuhuili/EAGLE-Qwen2-72B-Instruct         | 1.05B              |
 
-
 ## Lossless guarantees of Speculative Decoding
 
 In vLLM, speculative decoding aims to enhance inference efficiency while maintaining accuracy. This section addresses the lossless guarantees of
@@ -250,8 +248,6 @@ speculative decoding, breaking down the guarantees into three key areas:
    same request across runs. For more details, see the FAQ section
    titled *Can the output of a prompt vary across runs in vLLM?* in the [FAQs](#faq).
 
-**Conclusion**
-
 While vLLM strives to ensure losslessness in speculative decoding, variations in generated outputs with and without speculative decoding
 can occur due to following factors:
 
@@ -259,8 +255,6 @@ can occur due to following factors:
 - **Batch Size and Numerical Stability**: Changes in batch size may cause variations in logprobs and output probabilities, potentially
   due to non-deterministic behavior in batched operations or numerical instability.
 
-**Mitigation Strategies**
-
 For mitigation strategies, please refer to the FAQ entry *Can the output of a prompt vary across runs in vLLM?* in the [FAQs](#faq).
 
 ## Resources for vLLM contributors
diff --git a/docs/source/features/tool_calling.md b/docs/source/features/tool_calling.md
index 062f2021eb62a..027ddb6d5eda3 100644
--- a/docs/source/features/tool_calling.md
+++ b/docs/source/features/tool_calling.md
@@ -55,21 +55,24 @@ print(f"Result: {get_weather(**json.loads(tool_call.arguments))}")
 ```
 
 Example output:
-```
+
+```text
 Function called: get_weather
 Arguments: {"location": "San Francisco, CA", "unit": "fahrenheit"}
 Result: Getting the weather for San Francisco, CA in fahrenheit...
 ```
 
 This example demonstrates:
-- Setting up the server with tool calling enabled
-- Defining an actual function to handle tool calls
-- Making a request with `tool_choice="auto"`
-- Handling the structured response and executing the corresponding function
+
+* Setting up the server with tool calling enabled
+* Defining an actual function to handle tool calls
+* Making a request with `tool_choice="auto"`
+* Handling the structured response and executing the corresponding function
 
 You can also specify a particular function using named function calling by setting `tool_choice={"type": "function", "function": {"name": "get_weather"}}`. Note that this will use the guided decoding backend - so the first time this is used, there will be several seconds of latency (or more) as the FSM is compiled for the first time before it is cached for subsequent requests.
 
 Remember that it's the callers responsibility to:
+
 1. Define appropriate tools in the request
 2. Include relevant context in the chat messages
 3. Handle the tool calls in your application logic
@@ -77,20 +80,21 @@ Remember that it's the callers responsibility to:
 For more advanced usage, including parallel tool calls and different model-specific parsers, see the sections below.
 
 ## Named Function Calling
+
 vLLM supports named function calling in the chat completion API by default. It does so using Outlines through guided decoding, so this is
 enabled by default, and will work with any supported model. You are guaranteed a validly-parsable function call - not a
 high-quality one.
 
-vLLM will use guided decoding to ensure the response matches the tool parameter object defined by the JSON schema in the `tools` parameter. 
+vLLM will use guided decoding to ensure the response matches the tool parameter object defined by the JSON schema in the `tools` parameter.
 For best results, we recommend ensuring that the expected output format / schema is specified in the prompt to ensure that the model's intended generation is aligned with the schema that it's being forced to generate by the guided decoding backend.
 
 To use a named function, you need to define the functions in the `tools` parameter of the chat completion request, and
 specify the `name` of one of the tools in the `tool_choice` parameter of the chat completion request.
 
-
 ## Automatic Function Calling
 
 To enable this feature, you should set the following flags:
+
 * `--enable-auto-tool-choice` -- **mandatory** Auto tool choice. tells vLLM that you want to enable the model to generate its own tool calls when it
 deems appropriate.
 * `--tool-call-parser` -- select the tool parser to use (listed below). Additional tool parsers
@@ -104,28 +108,28 @@ from HuggingFace; and you can find an example of this in a `tokenizer_config.jso
 
 If your favorite tool-calling model is not supported, please feel free to contribute a parser & tool use chat template!
 
-
 ### Hermes Models (`hermes`)
 
 All Nous Research Hermes-series models newer than Hermes 2 Pro should be supported.
+
 * `NousResearch/Hermes-2-Pro-*`
 * `NousResearch/Hermes-2-Theta-*`
 * `NousResearch/Hermes-3-*`
 
-
 _Note that the Hermes 2 **Theta** models are known to have degraded tool call quality & capabilities due to the merge
 step in their creation_.
 
 Flags: `--tool-call-parser hermes`
 
-
 ### Mistral Models (`mistral`)
 
 Supported models:
+
 * `mistralai/Mistral-7B-Instruct-v0.3` (confirmed)
 * Additional mistral function-calling models are compatible as well.
 
 Known issues:
+
 1. Mistral 7B struggles to generate parallel tool calls correctly.
 2. Mistral's `tokenizer_config.json` chat template requires tool call IDs that are exactly 9 digits, which is
 much shorter than what vLLM generates. Since an exception is thrown when this condition
@@ -136,13 +140,12 @@ it works with vLLM's tool call IDs (provided `tool_call_id` fields are truncated
 * `examples/tool_chat_template_mistral_parallel.jinja` - this is a "better" version that adds a tool-use system prompt
 when tools are provided, that results in much better reliability when working with parallel tool calling.
 
-
 Recommended flags: `--tool-call-parser mistral --chat-template examples/tool_chat_template_mistral_parallel.jinja`
 
-
 ### Llama Models (`llama3_json`)
 
 Supported models:
+
 * `meta-llama/Meta-Llama-3.1-8B-Instruct`
 * `meta-llama/Meta-Llama-3.1-70B-Instruct`
 * `meta-llama/Meta-Llama-3.1-405B-Instruct`
@@ -152,6 +155,7 @@ The tool calling that is supported is the [JSON based tool calling](https://llam
 Other tool calling formats like the built in python tool calling or custom tool calling are not supported.
 
 Known issues:
+
 1. Parallel tool calls are not supported.
 2. The model can generate parameters with a wrong format, such as generating
    an array serialized as string instead of an array.
@@ -164,6 +168,7 @@ Recommended flags: `--tool-call-parser llama3_json --chat-template examples/tool
 #### IBM Granite
 
 Supported models:
+
 * `ibm-granite/granite-3.0-8b-instruct`
 
 Recommended flags: `--tool-call-parser granite --chat-template examples/tool_chat_template_granite.jinja`
@@ -182,42 +187,45 @@ Recommended flags: `--tool-call-parser granite-20b-fc --chat-template examples/t
 
 `examples/tool_chat_template_granite_20b_fc.jinja`: this is a modified chat template from the original on Huggingface, which is not vLLM compatible. It blends function description elements from the Hermes template and follows the same system prompt as "Response Generation" mode from [the paper](https://arxiv.org/abs/2407.00121). Parallel function calls are supported.
 
-
 ### InternLM Models (`internlm`)
 
 Supported models:
+
 * `internlm/internlm2_5-7b-chat` (confirmed)
 * Additional internlm2.5 function-calling models are compatible as well
 
 Known issues:
+
 * Although this implementation also supports InternLM2, the tool call results are not stable when testing with the `internlm/internlm2-chat-7b` model.
 
 Recommended flags: `--tool-call-parser internlm --chat-template examples/tool_chat_template_internlm2_tool.jinja`
 
-
 ### Jamba Models (`jamba`)
+
 AI21's Jamba-1.5 models are supported.
+
 * `ai21labs/AI21-Jamba-1.5-Mini`
 * `ai21labs/AI21-Jamba-1.5-Large`
 
-
 Flags: `--tool-call-parser jamba`
 
-
 ### Models with Pythonic Tool Calls (`pythonic`)
 
 A growing number of models output a python list to represent tool calls instead of using JSON. This has the advantage of inherently supporting parallel tool calls and removing ambiguity around the JSON schema required for tool calls. The `pythonic` tool parser can support such models.
 
 As a concrete example, these models may look up the weather in San Francisco and Seattle by generating:
+
 ```python
 [get_weather(city='San Francisco', metric='celsius'), get_weather(city='Seattle', metric='celsius')]
 ```
 
 Limitations:
+
 * The model must not generate both text and tool calls in the same generation. This may not be hard to change for a specific model, but the community currently lacks consensus on which tokens to emit when starting and ending tool calls.  (In particular, the Llama 3.2 models emit no such tokens.)
 * Llama's smaller models struggle to use tools effectively.
 
 Example supported models:
+
 * `meta-llama/Llama-3.2-1B-Instruct`\* (use with `examples/tool_chat_template_llama3.2_pythonic.jinja`)
 * `meta-llama/Llama-3.2-3B-Instruct`\* (use with `examples/tool_chat_template_llama3.2_pythonic.jinja`)
 * `Team-ACE/ToolACE-8B` (use with `examples/tool_chat_template_toolace.jinja`)
@@ -231,7 +239,6 @@ Llama's smaller models frequently fail to emit tool calls in the correct format.
 
 ---
 
-
 ## How to write a tool parser plugin
 
 A tool parser plugin is a Python file containing one or more ToolParser implementations. You can write a ToolParser similar to the `Hermes2ProToolParser` in vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py.
@@ -284,7 +291,8 @@ class ExampleToolParser(ToolParser):
 ```
 
 Then you can use this plugin in the command line like this.
-```
+
+```console
     --enable-auto-tool-choice \
     --tool-parser-plugin <absolute path of the plugin file>
     --tool-call-parser example \
diff --git a/docs/source/getting_started/faq.md b/docs/source/getting_started/faq.md
index fde2954f10c59..4751b325e6fc4 100644
--- a/docs/source/getting_started/faq.md
+++ b/docs/source/getting_started/faq.md
@@ -30,7 +30,7 @@ changes in batch size, or batch expansion in speculative decoding. These batchin
 can lead to slightly different logit/logprob values at each step. Such differences can accumulate, potentially resulting in
 different tokens being sampled. Once a different token is sampled, further divergence is likely.
 
-**Mitigation Strategies**
+## Mitigation Strategies
 
 - For improved stability and reduced variance, use `float32`. Note that this will require more memory.
 - If using `bfloat16`, switching to `float16` can also help.
diff --git a/docs/source/getting_started/installation/cpu-apple.md b/docs/source/getting_started/installation/cpu-apple.md
index b55e4384d064d..1068893f5bafa 100644
--- a/docs/source/getting_started/installation/cpu-apple.md
+++ b/docs/source/getting_started/installation/cpu-apple.md
@@ -18,25 +18,23 @@ Currently the CPU implementation for macOS supports FP32 and FP16 datatypes.
 
 After installation of XCode and the Command Line Tools, which include Apple Clang, execute the following commands to build and install vLLM from the source.
 
-```
-$ git clone https://github.com/vllm-project/vllm.git
-$ cd vllm
-$ pip install -r requirements-cpu.txt
-$ pip install -e . 
+```console
+git clone https://github.com/vllm-project/vllm.git
+cd vllm
+pip install -r requirements-cpu.txt
+pip install -e . 
 ```
 
 ```{note}
 On macOS the `VLLM_TARGET_DEVICE` is automatically set to `cpu`, which currently is the only supported device.
 ```
 
-
-
 ## Troubleshooting
 
-If the build has error like the following snippet where standard C++ headers cannot be found, try to remove and reinstall your 
+If the build has error like the following snippet where standard C++ headers cannot be found, try to remove and reinstall your
 [Command Line Tools for Xcode](https://developer.apple.com/download/all/).
 
-```
+```text
 [...] fatal error: 'map' file not found
           1 | #include <map>
             |          ^~~~~
@@ -48,4 +46,3 @@ If the build has error like the following snippet where standard C++ headers can
             |          ^~~~~~~~~
       1 error generated.
 ```
-
diff --git a/docs/source/getting_started/installation/cpu-x86.md b/docs/source/getting_started/installation/cpu-x86.md
index 26bdcd93ad190..c49c8e0f2a18c 100644
--- a/docs/source/getting_started/installation/cpu-x86.md
+++ b/docs/source/getting_started/installation/cpu-x86.md
@@ -32,13 +32,13 @@ Table of contents:
 ## Quick start using Dockerfile
 
 ```console
-$ docker build -f Dockerfile.cpu -t vllm-cpu-env --shm-size=4g .
-$ docker run -it \
-             --rm \
-             --network=host \
-             --cpuset-cpus=<cpu-id-list, optional> \
-             --cpuset-mems=<memory-node, optional> \
-             vllm-cpu-env
+docker build -f Dockerfile.cpu -t vllm-cpu-env --shm-size=4g .
+docker run -it \
+           --rm \
+           --network=host \
+           --cpuset-cpus=<cpu-id-list, optional> \
+           --cpuset-mems=<memory-node, optional> \
+           vllm-cpu-env
 ```
 
 (build-cpu-backend-from-source)=
@@ -48,23 +48,23 @@ $ docker run -it \
 - First, install recommended compiler. We recommend to use `gcc/g++ >= 12.3.0` as the default compiler to avoid potential problems. For example, on Ubuntu 22.4, you can run:
 
 ```console
-$ sudo apt-get update  -y
-$ sudo apt-get install -y gcc-12 g++-12 libnuma-dev
-$ sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
+sudo apt-get update  -y
+sudo apt-get install -y gcc-12 g++-12 libnuma-dev
+sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
 ```
 
 - Second, install Python packages for vLLM CPU backend building:
 
 ```console
-$ pip install --upgrade pip
-$ pip install cmake>=3.26 wheel packaging ninja "setuptools-scm>=8" numpy
-$ pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu
+pip install --upgrade pip
+pip install "cmake>=3.26" wheel packaging ninja "setuptools-scm>=8" numpy
+pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu
 ```
 
 - Finally, build and install vLLM CPU backend:
 
 ```console
-$ VLLM_TARGET_DEVICE=cpu python setup.py install
+VLLM_TARGET_DEVICE=cpu python setup.py install
 ```
 
 ```{note}
@@ -92,18 +92,18 @@ $ VLLM_TARGET_DEVICE=cpu python setup.py install
 - We highly recommend to use TCMalloc for high performance memory allocation and better cache locality. For example, on Ubuntu 22.4, you can run:
 
 ```console
-$ sudo apt-get install libtcmalloc-minimal4 # install TCMalloc library
-$ find / -name *libtcmalloc* # find the dynamic link library path
-$ export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD # prepend the library to LD_PRELOAD
-$ python examples/offline_inference/basic.py # run vLLM
+sudo apt-get install libtcmalloc-minimal4 # install TCMalloc library
+find / -name "*libtcmalloc*" # find the dynamic link library path
+export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD # prepend the library to LD_PRELOAD
+python examples/offline_inference/basic.py # run vLLM
 ```
 
 - When using the online serving, it is recommended to reserve 1-2 CPU cores for the serving framework to avoid CPU oversubscription. For example, on a platform with 32 physical CPU cores, reserving CPU 30 and 31 for the framework and using CPU 0-29 for OpenMP:
 
 ```console
-$ export VLLM_CPU_KVCACHE_SPACE=40
-$ export VLLM_CPU_OMP_THREADS_BIND=0-29
-$ vllm serve facebook/opt-125m
+export VLLM_CPU_KVCACHE_SPACE=40
+export VLLM_CPU_OMP_THREADS_BIND=0-29
+vllm serve facebook/opt-125m
 ```
 
 - If using vLLM CPU backend on a machine with hyper-threading, it is recommended to bind only one OpenMP thread on each physical CPU core using `VLLM_CPU_OMP_THREADS_BIND`. On a hyper-threading enabled platform with 16 logical CPU cores / 8 physical CPU cores:
@@ -148,7 +148,7 @@ $ python examples/offline_inference/basic.py
   - Using Tensor Parallel for a latency constraints deployment: following GPU backend design, a Megatron-LM's parallel algorithm will be used to shard the model, based on the number of NUMA nodes (e.g. TP = 2 for a two NUMA node system). With [TP feature on CPU](gh-pr:6125) merged, Tensor Parallel is supported for serving and offline inferencing. In general each NUMA node is treated as one GPU card. Below is the example script to enable Tensor Parallel = 2 for serving:
 
     ```console
-    $ VLLM_CPU_KVCACHE_SPACE=40 VLLM_CPU_OMP_THREADS_BIND="0-31|32-63" vllm serve meta-llama/Llama-2-7b-chat-hf -tp=2 --distributed-executor-backend mp
+    VLLM_CPU_KVCACHE_SPACE=40 VLLM_CPU_OMP_THREADS_BIND="0-31|32-63" vllm serve meta-llama/Llama-2-7b-chat-hf -tp=2 --distributed-executor-backend mp
     ```
 
   - Using Data Parallel for maximum throughput: to launch an LLM serving endpoint on each NUMA node along with one additional load balancer to dispatch the requests to those endpoints. Common solutions like [Nginx](#nginxloadbalancer) or HAProxy are recommended. Anyscale Ray project provides the feature on LLM [serving](https://docs.ray.io/en/latest/serve/index.html). Here is the example to setup a scalable LLM serving with [Ray Serve](https://github.com/intel/llm-on-ray/blob/main/docs/setup.md).
diff --git a/docs/source/getting_started/installation/gpu-cuda.md b/docs/source/getting_started/installation/gpu-cuda.md
index 419b8163fc034..727486abbd10f 100644
--- a/docs/source/getting_started/installation/gpu-cuda.md
+++ b/docs/source/getting_started/installation/gpu-cuda.md
@@ -17,9 +17,9 @@ vLLM is a Python library that also contains pre-compiled C++ and CUDA (12.1) bin
 You can create a new Python environment using `conda`:
 
 ```console
-$ # (Recommended) Create a new conda environment.
-$ conda create -n myenv python=3.12 -y
-$ conda activate myenv
+# (Recommended) Create a new conda environment.
+conda create -n myenv python=3.12 -y
+conda activate myenv
 ```
 
 ```{note}
@@ -29,9 +29,9 @@ $ conda activate myenv
 Or you can create a new Python environment using [uv](https://docs.astral.sh/uv/), a very fast Python environment manager. Please follow the [documentation](https://docs.astral.sh/uv/#getting-started) to install `uv`. After installing `uv`, you can create a new Python environment using the following command:
 
 ```console
-$ # (Recommended) Create a new uv environment. Use `--seed` to install `pip` and `setuptools` in the environment.
-$ uv venv myenv --python 3.12 --seed
-$ source myenv/bin/activate
+# (Recommended) Create a new uv environment. Use `--seed` to install `pip` and `setuptools` in the environment.
+uv venv myenv --python 3.12 --seed
+source myenv/bin/activate
 ```
 
 In order to be performant, vLLM has to compile many cuda kernels. The compilation unfortunately introduces binary incompatibility with other CUDA versions and PyTorch versions, even for the same PyTorch version with different building configurations.
@@ -43,18 +43,18 @@ Therefore, it is recommended to install vLLM with a **fresh new** environment. I
 You can install vLLM using either `pip` or `uv pip`:
 
 ```console
-$ # Install vLLM with CUDA 12.1.
-$ pip install vllm # If you are using pip.
-$ uv pip install vllm # If you are using uv.
+# Install vLLM with CUDA 12.1.
+pip install vllm # If you are using pip.
+uv pip install vllm # If you are using uv.
 ```
 
 As of now, vLLM's binaries are compiled with CUDA 12.1 and public PyTorch release versions by default. We also provide vLLM binaries compiled with CUDA 11.8 and public PyTorch release versions:
 
 ```console
-$ # Install vLLM with CUDA 11.8.
-$ export VLLM_VERSION=0.6.1.post1
-$ export PYTHON_VERSION=310
-$ pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux1_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118
+# Install vLLM with CUDA 11.8.
+export VLLM_VERSION=0.6.1.post1
+export PYTHON_VERSION=310
+pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux1_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118
 ```
 
 (install-the-latest-code)=
@@ -66,7 +66,7 @@ LLM inference is a fast-evolving field, and the latest code may contain bug fixe
 ### Install the latest code using `pip`
 
 ```console
-$ pip install vllm --pre --extra-index-url https://wheels.vllm.ai/nightly
+pip install vllm --pre --extra-index-url https://wheels.vllm.ai/nightly
 ```
 
 `--pre` is required for `pip` to consider pre-released versions.
@@ -74,8 +74,8 @@ $ pip install vllm --pre --extra-index-url https://wheels.vllm.ai/nightly
 If you want to access the wheels for previous commits (e.g. to bisect the behavior change, performance regression), due to the limitation of `pip`, you have to specify the full URL of the wheel file by embedding the commit hash in the URL:
 
 ```console
-$ export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch
-$ pip install https://wheels.vllm.ai/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
+export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch
+pip install https://wheels.vllm.ai/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
 ```
 
 Note that the wheels are built with Python 3.8 ABI (see [PEP 425](https://peps.python.org/pep-0425/) for more details about ABI), so **they are compatible with Python 3.8 and later**. The version string in the wheel file name (`1.0.0.dev`) is just a placeholder to have a unified URL for the wheels, the actual versions of wheels are contained in the wheel metadata (the wheels listed in the extra index url have correct versions). Although we don't support Python 3.8 any more (because PyTorch 2.5 dropped support for Python 3.8), the wheels are still built with Python 3.8 ABI to keep the same wheel name as before.
@@ -85,14 +85,14 @@ Note that the wheels are built with Python 3.8 ABI (see [PEP 425](https://peps.p
 Another way to install the latest code is to use `uv`:
 
 ```console
-$ uv pip install vllm --extra-index-url https://wheels.vllm.ai/nightly
+uv pip install vllm --extra-index-url https://wheels.vllm.ai/nightly
 ```
 
 If you want to access the wheels for previous commits (e.g. to bisect the behavior change, performance regression), you can specify the commit hash in the URL:
 
 ```console
-$ export VLLM_COMMIT=72d9c316d3f6ede485146fe5aabd4e61dbc59069 # use full commit hash from the main branch
-$ uv pip install vllm --extra-index-url https://wheels.vllm.ai/${VLLM_COMMIT}
+export VLLM_COMMIT=72d9c316d3f6ede485146fe5aabd4e61dbc59069 # use full commit hash from the main branch
+uv pip install vllm --extra-index-url https://wheels.vllm.ai/${VLLM_COMMIT}
 ```
 
 The `uv` approach works for vLLM `v0.6.6` and later and offers an easy-to-remember command. A unique feature of `uv` is that packages in `--extra-index-url` have [higher priority than the default index](https://docs.astral.sh/uv/pip/compatibility/#packages-that-exist-on-multiple-indexes). If the latest public release is `v0.6.6.post1`, `uv`'s behavior allows installing a commit before `v0.6.6.post1` by specifying the `--extra-index-url`. In contrast, `pip` combines packages from `--extra-index-url` and the default index, choosing only the latest version, which makes it difficult to install a development version prior to the released version.
@@ -102,8 +102,8 @@ The `uv` approach works for vLLM `v0.6.6` and later and offers an easy-to-rememb
 Another way to access the latest code is to use the docker images:
 
 ```console
-$ export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch
-$ docker pull public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:${VLLM_COMMIT}
+export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch
+docker pull public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:${VLLM_COMMIT}
 ```
 
 These docker images are used for CI and testing only, and they are not intended for production use. They will be expired after several days.
@@ -121,18 +121,18 @@ The latest code can contain bugs and may not be stable. Please use it with cauti
 If you only need to change Python code, you can build and install vLLM without compilation. Using `pip`'s [`--editable` flag](https://pip.pypa.io/en/stable/topics/local-project-installs/#editable-installs), changes you make to the code will be reflected when you run vLLM:
 
 ```console
-$ git clone https://github.com/vllm-project/vllm.git
-$ cd vllm
-$ VLLM_USE_PRECOMPILED=1 pip install --editable .
+git clone https://github.com/vllm-project/vllm.git
+cd vllm
+VLLM_USE_PRECOMPILED=1 pip install --editable .
 ```
 
-This will download the latest nightly wheel from https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl and use the compiled libraries from there in the installation.
+This will download the [latest nightly wheel](https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl) and use the compiled libraries from there in the installation.
 
 The `VLLM_PRECOMPILED_WHEEL_LOCATION` environment variable can be used instead of `VLLM_USE_PRECOMPILED` to specify a custom path or URL to the wheel file. For example, to use the [0.6.1.post1 PyPi wheel](https://pypi.org/project/vllm/#files):
 
 ```console
-$ export VLLM_PRECOMPILED_WHEEL_LOCATION=https://files.pythonhosted.org/packages/4a/4c/ee65ba33467a4c0de350ce29fbae39b9d0e7fcd887cc756fa993654d1228/vllm-0.6.3.post1-cp38-abi3-manylinux1_x86_64.whl
-$ pip install --editable .
+export VLLM_PRECOMPILED_WHEEL_LOCATION=https://files.pythonhosted.org/packages/4a/4c/ee65ba33467a4c0de350ce29fbae39b9d0e7fcd887cc756fa993654d1228/vllm-0.6.3.post1-cp38-abi3-manylinux1_x86_64.whl
+pip install --editable .
 ```
 
 You can find more information about vLLM's wheels [above](#install-the-latest-code).
@@ -147,9 +147,9 @@ It is recommended to use the same commit ID for the source code as the vLLM whee
 If you want to modify C++ or CUDA code, you'll need to build vLLM from source. This can take several minutes:
 
 ```console
-$ git clone https://github.com/vllm-project/vllm.git
-$ cd vllm
-$ pip install -e .
+git clone https://github.com/vllm-project/vllm.git
+cd vllm
+pip install -e .
 ```
 
 ```{tip}
@@ -172,11 +172,11 @@ There are scenarios where the PyTorch dependency cannot be easily installed via
 To build vLLM using an existing PyTorch installation:
 
 ```console
-$ git clone https://github.com/vllm-project/vllm.git
-$ cd vllm
-$ python use_existing_torch.py
-$ pip install -r requirements-build.txt
-$ pip install -e . --no-build-isolation
+git clone https://github.com/vllm-project/vllm.git
+cd vllm
+python use_existing_torch.py
+pip install -r requirements-build.txt
+pip install -e . --no-build-isolation
 ```
 
 #### Use the local cutlass for compilation
@@ -185,9 +185,9 @@ Currently, before starting the build process, vLLM fetches cutlass code from Git
 To achieve this, you can set the environment variable VLLM_CUTLASS_SRC_DIR to point to your local cutlass directory.
 
 ```console
-$ git clone https://github.com/vllm-project/vllm.git
-$ cd vllm
-$ VLLM_CUTLASS_SRC_DIR=/path/to/cutlass pip install -e .
+git clone https://github.com/vllm-project/vllm.git
+cd vllm
+VLLM_CUTLASS_SRC_DIR=/path/to/cutlass pip install -e .
 ```
 
 #### Troubleshooting
@@ -196,8 +196,8 @@ To avoid your system being overloaded, you can limit the number of compilation j
 to be run simultaneously, via the environment variable `MAX_JOBS`. For example:
 
 ```console
-$ export MAX_JOBS=6
-$ pip install -e .
+export MAX_JOBS=6
+pip install -e .
 ```
 
 This is especially useful when you are building on less powerful machines. For example, when you use WSL it only [assigns 50% of the total memory by default](https://learn.microsoft.com/en-us/windows/wsl/wsl-config#main-wsl-settings), so using `export MAX_JOBS=1` can avoid compiling multiple files simultaneously and running out of memory.
@@ -206,22 +206,22 @@ A side effect is a much slower build process.
 Additionally, if you have trouble building vLLM, we recommend using the NVIDIA PyTorch Docker image.
 
 ```console
-$ # Use `--ipc=host` to make sure the shared memory is large enough.
-$ docker run --gpus all -it --rm --ipc=host nvcr.io/nvidia/pytorch:23.10-py3
+# Use `--ipc=host` to make sure the shared memory is large enough.
+docker run --gpus all -it --rm --ipc=host nvcr.io/nvidia/pytorch:23.10-py3
 ```
 
 If you don't want to use docker, it is recommended to have a full installation of CUDA Toolkit. You can download and install it from [the official website](https://developer.nvidia.com/cuda-toolkit-archive). After installation, set the environment variable `CUDA_HOME` to the installation path of CUDA Toolkit, and make sure that the `nvcc` compiler is in your `PATH`, e.g.:
 
 ```console
-$ export CUDA_HOME=/usr/local/cuda
-$ export PATH="${CUDA_HOME}/bin:$PATH"
+export CUDA_HOME=/usr/local/cuda
+export PATH="${CUDA_HOME}/bin:$PATH"
 ```
 
 Here is a sanity check to verify that the CUDA Toolkit is correctly installed:
 
 ```console
-$ nvcc --version # verify that nvcc is in your PATH
-$ ${CUDA_HOME}/bin/nvcc --version # verify that nvcc is in your CUDA_HOME
+nvcc --version # verify that nvcc is in your PATH
+${CUDA_HOME}/bin/nvcc --version # verify that nvcc is in your CUDA_HOME
 ```
 
 ### Unsupported OS build
@@ -231,6 +231,6 @@ vLLM can fully run only on Linux but for development purposes, you can still bui
 Simply disable the `VLLM_TARGET_DEVICE` environment variable before installing:
 
 ```console
-$ export VLLM_TARGET_DEVICE=empty
-$ pip install -e .
+export VLLM_TARGET_DEVICE=empty
+pip install -e .
 ```
diff --git a/docs/source/getting_started/installation/gpu-rocm.md b/docs/source/getting_started/installation/gpu-rocm.md
index e36b92513e31d..a8971bb96248c 100644
--- a/docs/source/getting_started/installation/gpu-rocm.md
+++ b/docs/source/getting_started/installation/gpu-rocm.md
@@ -47,13 +47,13 @@ Their values can be passed in when running `docker build` with `--build-arg` opt
 To build vllm on ROCm 6.2 for MI200 and MI300 series, you can use the default:
 
 ```console
-$ DOCKER_BUILDKIT=1 docker build -f Dockerfile.rocm -t vllm-rocm .
+DOCKER_BUILDKIT=1 docker build -f Dockerfile.rocm -t vllm-rocm .
 ```
 
 To build vllm on ROCm 6.2 for Radeon RX7900 series (gfx1100), you should specify `BUILD_FA` as below:
 
 ```console
-$ DOCKER_BUILDKIT=1 docker build --build-arg BUILD_FA="0" -f Dockerfile.rocm -t vllm-rocm .
+DOCKER_BUILDKIT=1 docker build --build-arg BUILD_FA="0" -f Dockerfile.rocm -t vllm-rocm .
 ```
 
 To run the above docker image `vllm-rocm`, use the below command:
@@ -83,81 +83,81 @@ Where the `<path/to/model>` is the location where the model is stored, for examp
 - [ROCm](https://rocm.docs.amd.com/en/latest/deploy/linux/index.html)
 - [PyTorch](https://pytorch.org/)
 
-For installing PyTorch, you can start from a fresh docker image, e.g, `rocm/pytorch:rocm6.2_ubuntu20.04_py3.9_pytorch_release_2.3.0`, `rocm/pytorch-nightly`.
+    For installing PyTorch, you can start from a fresh docker image, e.g., `rocm/pytorch:rocm6.2_ubuntu20.04_py3.9_pytorch_release_2.3.0`, `rocm/pytorch-nightly`.
 
-Alternatively, you can install PyTorch using PyTorch wheels. You can check PyTorch installation guide in PyTorch [Getting Started](https://pytorch.org/get-started/locally/)
+    Alternatively, you can install PyTorch using PyTorch wheels. You can check PyTorch installation guide in PyTorch [Getting Started](https://pytorch.org/get-started/locally/).
 
 1. Install [Triton flash attention for ROCm](https://github.com/ROCm/triton)
 
-Install ROCm's Triton flash attention (the default triton-mlir branch) following the instructions from [ROCm/triton](https://github.com/ROCm/triton/blob/triton-mlir/README.md)
+    Install ROCm's Triton flash attention (the default triton-mlir branch) following the instructions from [ROCm/triton](https://github.com/ROCm/triton/blob/triton-mlir/README.md)
 
-```console
-$ python3 -m pip install ninja cmake wheel pybind11
-$ pip uninstall -y triton
-$ git clone https://github.com/OpenAI/triton.git
-$ cd triton
-$ git checkout e192dba
-$ cd python
-$ pip3 install .
-$ cd ../..
-```
+    ```console
+    python3 -m pip install ninja cmake wheel pybind11
+    pip uninstall -y triton
+    git clone https://github.com/OpenAI/triton.git
+    cd triton
+    git checkout e192dba
+    cd python
+    pip3 install .
+    cd ../..
+    ```
 
-```{note}
-- If you see HTTP issue related to downloading packages during building triton, please try again as the HTTP error is intermittent.
-```
+    ```{note}
+    - If you see HTTP issue related to downloading packages during building triton, please try again as the HTTP error is intermittent.
+    ```
 
 2. Optionally, if you choose to use CK flash attention, you can install [flash attention for ROCm](https://github.com/ROCm/flash-attention/tree/ck_tile)
 
-Install ROCm's flash attention (v2.5.9.post1) following the instructions from [ROCm/flash-attention](https://github.com/ROCm/flash-attention/tree/ck_tile#amd-gpurocm-support)
-Alternatively, wheels intended for vLLM use can be accessed under the releases.
+    Install ROCm's flash attention (v2.5.9.post1) following the instructions from [ROCm/flash-attention](https://github.com/ROCm/flash-attention/tree/ck_tile#amd-gpurocm-support).
+    Alternatively, wheels intended for vLLM use can be accessed under the releases.
 
-For example, for ROCm 6.2, suppose your gfx arch is `gfx90a`. To get your gfx architecture, run `rocminfo |grep gfx`.
+    For example, for ROCm 6.2, suppose your gfx arch is `gfx90a`. To get your gfx architecture, run `rocminfo |grep gfx`.
 
-```console
-$ git clone https://github.com/ROCm/flash-attention.git
-$ cd flash-attention
-$ git checkout 3cea2fb
-$ git submodule update --init
-$ GPU_ARCHS="gfx90a" python3 setup.py install
-$ cd ..
-```
+    ```console
+    git clone https://github.com/ROCm/flash-attention.git
+    cd flash-attention
+    git checkout 3cea2fb
+    git submodule update --init
+    GPU_ARCHS="gfx90a" python3 setup.py install
+    cd ..
+    ```
 
-```{note}
-- You might need to downgrade the "ninja" version to 1.10 it is not used when compiling flash-attention-2 (e.g. `pip install ninja==1.10.2.4`)
-```
+    ```{note}
+    - You might need to downgrade the "ninja" version to 1.10, as it is not used when compiling flash-attention-2 (e.g. `pip install ninja==1.10.2.4`)
+    ```
 
 3. Build vLLM. For example, vLLM on ROCM 6.2 can be built with the following steps:
 
-```bash
-$ pip install --upgrade pip
+    ```bash
+    $ pip install --upgrade pip
 
-# Install PyTorch
-$ pip uninstall torch -y
-$ pip install --no-cache-dir --pre torch==2.6.0.dev20241024 --index-url https://download.pytorch.org/whl/nightly/rocm6.2
+    # Install PyTorch
+    $ pip uninstall torch -y
+    $ pip install --no-cache-dir --pre torch==2.6.0.dev20241024 --index-url https://download.pytorch.org/whl/nightly/rocm6.2
 
-# Build & install AMD SMI
-$ pip install /opt/rocm/share/amd_smi
+    # Build & install AMD SMI
+    $ pip install /opt/rocm/share/amd_smi
 
-# Install dependencies
-$ pip install --upgrade numba scipy huggingface-hub[cli]
-$ pip install "numpy<2"
-$ pip install -r requirements-rocm.txt
+    # Install dependencies
+    $ pip install --upgrade numba scipy huggingface-hub[cli]
+    $ pip install "numpy<2"
+    $ pip install -r requirements-rocm.txt
 
-# Build vLLM for MI210/MI250/MI300.
-$ export PYTORCH_ROCM_ARCH="gfx90a;gfx942"
-$ python3 setup.py develop
-```
+    # Build vLLM for MI210/MI250/MI300.
+    $ export PYTORCH_ROCM_ARCH="gfx90a;gfx942"
+    $ python3 setup.py develop
+    ```
 
-This may take 5-10 minutes. Currently, `pip install .` does not work for ROCm installation.
+    This may take 5-10 minutes. Currently, `pip install .` does not work for ROCm installation.
 
-```{tip}
-- Triton flash attention is used by default. For benchmarking purposes, it is recommended to run a warm up step before collecting perf numbers.
-- Triton flash attention does not currently support sliding window attention. If using half precision, please use CK flash-attention for sliding window support.
-- To use CK flash-attention or PyTorch naive attention, please use this flag `export VLLM_USE_TRITON_FLASH_ATTN=0` to turn off triton flash attention.
-- The ROCm version of PyTorch, ideally, should match the ROCm driver version.
-```
+    ```{tip}
+    - Triton flash attention is used by default. For benchmarking purposes, it is recommended to run a warm up step before collecting perf numbers.
+    - Triton flash attention does not currently support sliding window attention. If using half precision, please use CK flash-attention for sliding window support.
+    - To use CK flash-attention or PyTorch naive attention, please use this flag `export VLLM_USE_TRITON_FLASH_ATTN=0` to turn off triton flash attention.
+    - The ROCm version of PyTorch, ideally, should match the ROCm driver version.
+    ```
 
-```{tip}
-- For MI300x (gfx942) users, to achieve optimal performance, please refer to [MI300x tuning guide](https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/index.html) for performance optimization and tuning tips on system and workflow level.
-  For vLLM, please refer to [vLLM performance optimization](https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/workload.html#vllm-performance-optimization).
-```
+    ```{tip}
+    - For MI300x (gfx942) users, to achieve optimal performance, please refer to [MI300x tuning guide](https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/index.html) for performance optimization and tuning tips on system and workflow level.
+      For vLLM, please refer to [vLLM performance optimization](https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/workload.html#vllm-performance-optimization).
+    ```
diff --git a/docs/source/getting_started/installation/hpu-gaudi.md b/docs/source/getting_started/installation/hpu-gaudi.md
index 21822327c8822..a829b1c9ff996 100644
--- a/docs/source/getting_started/installation/hpu-gaudi.md
+++ b/docs/source/getting_started/installation/hpu-gaudi.md
@@ -22,8 +22,8 @@ Guide](https://docs.habana.ai/en/latest/PyTorch/Model_Optimization_PyTorch/Optim
 ### Quick start using Dockerfile
 
 ```console
-$ docker build -f Dockerfile.hpu -t vllm-hpu-env  .
-$ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --rm vllm-hpu-env
+docker build -f Dockerfile.hpu -t vllm-hpu-env  .
+docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --rm vllm-hpu-env
 ```
 
 ```{tip}
@@ -37,10 +37,10 @@ If you're observing the following error: `docker: Error response from daemon: Un
 To verify that the Intel Gaudi software was correctly installed, run:
 
 ```console
-$ hl-smi # verify that hl-smi is in your PATH and each Gaudi accelerator is visible
-$ apt list --installed | grep habana # verify that habanalabs-firmware-tools, habanalabs-graph, habanalabs-rdma-core, habanalabs-thunk and habanalabs-container-runtime are installed
-$ pip list | grep habana # verify that habana-torch-plugin, habana-torch-dataloader, habana-pyhlml and habana-media-loader are installed
-$ pip list | grep neural # verify that neural_compressor is installed
+hl-smi # verify that hl-smi is in your PATH and each Gaudi accelerator is visible
+apt list --installed | grep habana # verify that habanalabs-firmware-tools, habanalabs-graph, habanalabs-rdma-core, habanalabs-thunk and habanalabs-container-runtime are installed
+pip list | grep habana # verify that habana-torch-plugin, habana-torch-dataloader, habana-pyhlml and habana-media-loader are installed
+pip list | grep neural # verify that neural_compressor is installed
 ```
 
 Refer to [Intel Gaudi Software Stack
@@ -57,8 +57,8 @@ for more details.
 Use the following commands to run a Docker image:
 
 ```console
-$ docker pull vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest
-$ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest
+docker pull vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest
+docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest
 ```
 
 #### Build and Install vLLM
@@ -66,18 +66,18 @@ $ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_
 To build and install vLLM from source, run:
 
 ```console
-$ git clone https://github.com/vllm-project/vllm.git
-$ cd vllm
-$ python setup.py develop
+git clone https://github.com/vllm-project/vllm.git
+cd vllm
+python setup.py develop
 ```
 
 Currently, the latest features and performance optimizations are developed in Gaudi's [vLLM-fork](https://github.com/HabanaAI/vllm-fork) and we periodically upstream them to vLLM main repo. To install latest [HabanaAI/vLLM-fork](https://github.com/HabanaAI/vllm-fork), run the following:
 
 ```console
-$ git clone https://github.com/HabanaAI/vllm-fork.git
-$ cd vllm-fork
-$ git checkout habana_main
-$ python setup.py develop
+git clone https://github.com/HabanaAI/vllm-fork.git
+cd vllm-fork
+git checkout habana_main
+python setup.py develop
 ```
 
 ## Supported Features
@@ -181,7 +181,7 @@ Bucketing allows us to reduce the number of required graphs significantly, but i
 
 Bucketing ranges are determined with 3 parameters - `min`, `step` and `max`. They can be set separately for prompt and decode phase, and for batch size and sequence length dimension. These parameters can be observed in logs during vLLM startup:
 
-```
+```text
 INFO 08-01 21:37:59 hpu_model_runner.py:493] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 4], seq:[128, 128, 1024]
 INFO 08-01 21:37:59 hpu_model_runner.py:499] Generated 24 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024)]
 INFO 08-01 21:37:59 hpu_model_runner.py:504] Decode bucket config (min, step, max_warmup) bs:[1, 128, 4], seq:[128, 128, 2048]
@@ -192,7 +192,7 @@ INFO 08-01 21:37:59 hpu_model_runner.py:509] Generated 48 decode buckets: [(1, 1
 
 Example (with ramp-up)
 
-```
+```text
 min = 2, step = 32, max = 64
 => ramp_up = (2, 4, 8, 16)
 => stable = (32, 64)
@@ -201,7 +201,7 @@ min = 2, step = 32, max = 64
 
 Example (without ramp-up)
 
-```
+```text
 min = 128, step = 128, max = 512
 => ramp_up = ()
 => stable = (128, 256, 384, 512)
@@ -224,7 +224,7 @@ Bucketing is transparent to a client -- padding in sequence length dimension is
 
 Warmup is an optional, but highly recommended step occurring before vLLM server starts listening. It executes a forward pass for each bucket with dummy data. The goal is to pre-compile all graphs and not incur any graph compilation overheads within bucket boundaries during server runtime. Each warmup step is logged during vLLM startup:
 
-```
+```text
 INFO 08-01 22:26:47 hpu_model_runner.py:1066] [Warmup][Prompt][1/24] batch_size:4 seq_len:1024 free_mem:79.16 GiB
 INFO 08-01 22:26:47 hpu_model_runner.py:1066] [Warmup][Prompt][2/24] batch_size:4 seq_len:896 free_mem:55.43 GiB
 INFO 08-01 22:26:48 hpu_model_runner.py:1066] [Warmup][Prompt][3/24] batch_size:4 seq_len:768 free_mem:55.43 GiB
@@ -273,7 +273,7 @@ When there's large amount of requests pending, vLLM scheduler will attempt to fi
 
 Each described step is logged by vLLM server, as follows (negative values correspond to memory being released):
 
-```
+```text
 INFO 08-02 17:37:44 hpu_model_runner.py:493] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 4], seq:[128, 128, 1024]
 INFO 08-02 17:37:44 hpu_model_runner.py:499] Generated 24 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024)]
 INFO 08-02 17:37:44 hpu_model_runner.py:504] Decode bucket config (min, step, max_warmup) bs:[1, 128, 4], seq:[128, 128, 2048]
@@ -349,19 +349,19 @@ INFO 08-02 17:38:43 hpu_executor.py:91] init_cache_engine took 37.92 GiB of devi
   - Default values:
 
     - Prompt:
-      : - batch size min (`VLLM_PROMPT_BS_BUCKET_MIN`): `1`
-        - batch size step (`VLLM_PROMPT_BS_BUCKET_STEP`): `min(max_num_seqs, 32)`
-        - batch size max (`VLLM_PROMPT_BS_BUCKET_MAX`): `min(max_num_seqs, 64)`
-        - sequence length min (`VLLM_PROMPT_SEQ_BUCKET_MIN`): `block_size`
-        - sequence length step (`VLLM_PROMPT_SEQ_BUCKET_STEP`): `block_size`
-        - sequence length max (`VLLM_PROMPT_SEQ_BUCKET_MAX`): `max_model_len`
+      - batch size min (`VLLM_PROMPT_BS_BUCKET_MIN`): `1`
+      - batch size step (`VLLM_PROMPT_BS_BUCKET_STEP`): `min(max_num_seqs, 32)`
+      - batch size max (`VLLM_PROMPT_BS_BUCKET_MAX`): `min(max_num_seqs, 64)`
+      - sequence length min (`VLLM_PROMPT_SEQ_BUCKET_MIN`): `block_size`
+      - sequence length step (`VLLM_PROMPT_SEQ_BUCKET_STEP`): `block_size`
+      - sequence length max (`VLLM_PROMPT_SEQ_BUCKET_MAX`): `max_model_len`
     - Decode:
-      : - batch size min (`VLLM_DECODE_BS_BUCKET_MIN`): `1`
-        - batch size step (`VLLM_DECODE_BS_BUCKET_STEP`): `min(max_num_seqs, 32)`
-        - batch size max (`VLLM_DECODE_BS_BUCKET_MAX`): `max_num_seqs`
-        - sequence length min (`VLLM_DECODE_BLOCK_BUCKET_MIN`): `block_size`
-        - sequence length step (`VLLM_DECODE_BLOCK_BUCKET_STEP`): `block_size`
-        - sequence length max (`VLLM_DECODE_BLOCK_BUCKET_MAX`): `max(128, (max_num_seqs*max_model_len)/block_size)`
+      - batch size min (`VLLM_DECODE_BS_BUCKET_MIN`): `1`
+      - batch size step (`VLLM_DECODE_BS_BUCKET_STEP`): `min(max_num_seqs, 32)`
+      - batch size max (`VLLM_DECODE_BS_BUCKET_MAX`): `max_num_seqs`
+      - sequence length min (`VLLM_DECODE_BLOCK_BUCKET_MIN`): `block_size`
+      - sequence length step (`VLLM_DECODE_BLOCK_BUCKET_STEP`): `block_size`
+      - sequence length max (`VLLM_DECODE_BLOCK_BUCKET_MAX`): `max(128, (max_num_seqs*max_model_len)/block_size)`
 
 Additionally, there are HPU PyTorch Bridge environment variables impacting vLLM execution:
 
diff --git a/docs/source/getting_started/installation/neuron.md b/docs/source/getting_started/installation/neuron.md
index 431f90537f543..5581b1940ca46 100644
--- a/docs/source/getting_started/installation/neuron.md
+++ b/docs/source/getting_started/installation/neuron.md
@@ -123,10 +123,10 @@ python -m pip install --upgrade neuronx-cc==2.* --pre torch-neuronx==2.1.* torch
 Once neuronx-cc and transformers-neuronx packages are installed, we will be able to install vllm as follows:
 
 ```console
-$ git clone https://github.com/vllm-project/vllm.git
-$ cd vllm
-$ pip install -U -r requirements-neuron.txt
-$ VLLM_TARGET_DEVICE="neuron" pip install .
+git clone https://github.com/vllm-project/vllm.git
+cd vllm
+pip install -U -r requirements-neuron.txt
+VLLM_TARGET_DEVICE="neuron" pip install .
 ```
 
 If neuron packages are detected correctly in the installation process, `vllm-0.3.0+neuron212` will be installed.
diff --git a/docs/source/getting_started/installation/openvino.md b/docs/source/getting_started/installation/openvino.md
index 60f95fd1c4250..d97d4173bf36b 100644
--- a/docs/source/getting_started/installation/openvino.md
+++ b/docs/source/getting_started/installation/openvino.md
@@ -27,8 +27,8 @@ vLLM powered by OpenVINO supports all LLM models from [vLLM supported models lis
 ## Quick start using Dockerfile
 
 ```console
-$ docker build -f Dockerfile.openvino -t vllm-openvino-env .
-$ docker run -it --rm vllm-openvino-env
+docker build -f Dockerfile.openvino -t vllm-openvino-env .
+docker run -it --rm vllm-openvino-env
 ```
 
 (install-openvino-backend-from-source)=
@@ -38,21 +38,21 @@ $ docker run -it --rm vllm-openvino-env
 - First, install Python. For example, on Ubuntu 22.04, you can run:
 
   ```console
-  $ sudo apt-get update  -y
-  $ sudo apt-get install python3
+  sudo apt-get update  -y
+  sudo apt-get install python3
   ```
 
 - Second, install prerequisites vLLM OpenVINO backend installation:
 
   ```console
-  $ pip install --upgrade pip
-  $ pip install -r requirements-build.txt --extra-index-url https://download.pytorch.org/whl/cpu
+  pip install --upgrade pip
+  pip install -r requirements-build.txt --extra-index-url https://download.pytorch.org/whl/cpu
   ```
 
 - Finally, install vLLM with OpenVINO backend:
 
   ```console
-  $ PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE=openvino python -m pip install -v .
+  PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE=openvino python -m pip install -v .
   ```
 
 - [Optional] To use vLLM OpenVINO backend with a GPU device, ensure your system is properly set up. Follow the instructions provided here: [https://docs.openvino.ai/2024/get-started/configurations/configurations-intel-gpu.html](https://docs.openvino.ai/2024/get-started/configurations/configurations-intel-gpu.html).
diff --git a/docs/source/getting_started/installation/tpu.md b/docs/source/getting_started/installation/tpu.md
index bc93c44fead30..1938785ade46a 100644
--- a/docs/source/getting_started/installation/tpu.md
+++ b/docs/source/getting_started/installation/tpu.md
@@ -156,14 +156,14 @@ For more information about using TPUs with GKE, see
 You can use <gh-file:Dockerfile.tpu> to build a Docker image with TPU support.
 
 ```console
-$ docker build -f Dockerfile.tpu -t vllm-tpu .
+docker build -f Dockerfile.tpu -t vllm-tpu .
 ```
 
 Run the Docker image with the following command:
 
 ```console
-$ # Make sure to add `--privileged --net host --shm-size=16G`.
-$ docker run --privileged --net host --shm-size=16G -it vllm-tpu
+# Make sure to add `--privileged --net host --shm-size=16G`.
+docker run --privileged --net host --shm-size=16G -it vllm-tpu
 ```
 
 ```{note}
diff --git a/docs/source/getting_started/installation/xpu.md b/docs/source/getting_started/installation/xpu.md
index c1ab5478eb652..73758f37cf0f6 100644
--- a/docs/source/getting_started/installation/xpu.md
+++ b/docs/source/getting_started/installation/xpu.md
@@ -40,15 +40,15 @@ $ docker run -it \
 - Second, install Python packages for vLLM XPU backend building:
 
 ```console
-$ source /opt/intel/oneapi/setvars.sh
-$ pip install --upgrade pip
-$ pip install -v -r requirements-xpu.txt
+source /opt/intel/oneapi/setvars.sh
+pip install --upgrade pip
+pip install -v -r requirements-xpu.txt
 ```
 
 - Finally, build and install vLLM XPU backend:
 
 ```console
-$ VLLM_TARGET_DEVICE=xpu python setup.py install
+VLLM_TARGET_DEVICE=xpu python setup.py install
 ```
 
 ```{note}
@@ -61,14 +61,14 @@ $ VLLM_TARGET_DEVICE=xpu python setup.py install
 XPU platform supports tensor-parallel inference/serving and also supports pipeline parallel as a beta feature for online serving. We requires Ray as the distributed runtime backend. For example, a reference execution likes following:
 
 ```console
-$ python -m vllm.entrypoints.openai.api_server \
-$      --model=facebook/opt-13b \
-$      --dtype=bfloat16 \
-$      --device=xpu \
-$      --max_model_len=1024 \
-$      --distributed-executor-backend=ray \
-$      --pipeline-parallel-size=2 \
-$      -tp=8
+python -m vllm.entrypoints.openai.api_server \
+     --model=facebook/opt-13b \
+     --dtype=bfloat16 \
+     --device=xpu \
+     --max_model_len=1024 \
+     --distributed-executor-backend=ray \
+     --pipeline-parallel-size=2 \
+     -tp=8
 ```
 
 By default, a ray instance will be launched automatically if no existing one is detected in system, with `num-gpus` equals to `parallel_config.world_size`. We recommend properly starting a ray cluster before execution, referring to the <gh-file:examples/online_serving/run_cluster.sh> helper script.
diff --git a/docs/source/getting_started/quickstart.md b/docs/source/getting_started/quickstart.md
index 6fd0083a9bb7b..8ac80e5e5c553 100644
--- a/docs/source/getting_started/quickstart.md
+++ b/docs/source/getting_started/quickstart.md
@@ -19,17 +19,17 @@ If you are using NVIDIA GPUs, you can install vLLM using [pip](https://pypi.org/
 It's recommended to use [uv](https://docs.astral.sh/uv/), a very fast Python environment manager, to create and manage Python environments. Please follow the [documentation](https://docs.astral.sh/uv/#getting-started) to install `uv`. After installing `uv`, you can create a new Python environment and install vLLM using the following commands:
 
 ```console
-$ uv venv myenv --python 3.12 --seed
-$ source myenv/bin/activate
-$ uv pip install vllm
+uv venv myenv --python 3.12 --seed
+source myenv/bin/activate
+uv pip install vllm
 ```
 
 You can also use [conda](https://docs.conda.io/projects/conda/en/latest/user-guide/getting-started.html) to create and manage Python environments.
 
 ```console
-$ conda create -n myenv python=3.12 -y
-$ conda activate myenv
-$ pip install vllm
+conda create -n myenv python=3.12 -y
+conda activate myenv
+pip install vllm
 ```
 
 ```{note}
@@ -94,7 +94,7 @@ By default, it starts the server at `http://localhost:8000`. You can specify the
 Run the following command to start the vLLM server with the [Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct) model:
 
 ```console
-$ vllm serve Qwen/Qwen2.5-1.5B-Instruct
+vllm serve Qwen/Qwen2.5-1.5B-Instruct
 ```
 
 ```{note}
@@ -105,7 +105,7 @@ You can learn about overriding it [here](#chat-template).
 This server can be queried in the same format as OpenAI API. For example, to list the models:
 
 ```console
-$ curl http://localhost:8000/v1/models
+curl http://localhost:8000/v1/models
 ```
 
 You can pass in the argument `--api-key` or environment variable `VLLM_API_KEY` to enable the server to check for API key in the header.
@@ -115,14 +115,14 @@ You can pass in the argument `--api-key` or environment variable `VLLM_API_KEY`
 Once your server is started, you can query the model with input prompts:
 
 ```console
-$ curl http://localhost:8000/v1/completions \
-$     -H "Content-Type: application/json" \
-$     -d '{
-$         "model": "Qwen/Qwen2.5-1.5B-Instruct",
-$         "prompt": "San Francisco is a",
-$         "max_tokens": 7,
-$         "temperature": 0
-$     }'
+curl http://localhost:8000/v1/completions \
+    -H "Content-Type: application/json" \
+    -d '{
+        "model": "Qwen/Qwen2.5-1.5B-Instruct",
+        "prompt": "San Francisco is a",
+        "max_tokens": 7,
+        "temperature": 0
+    }'
 ```
 
 Since this server is compatible with OpenAI API, you can use it as a drop-in replacement for any applications using OpenAI API. For example, another way to query the server is via the `openai` Python package:
@@ -151,15 +151,15 @@ vLLM is designed to also support the OpenAI Chat Completions API. The chat inter
 You can use the [create chat completion](https://platform.openai.com/docs/api-reference/chat/completions/create) endpoint to interact with the model:
 
 ```console
-$ curl http://localhost:8000/v1/chat/completions \
-$     -H "Content-Type: application/json" \
-$     -d '{
-$         "model": "Qwen/Qwen2.5-1.5B-Instruct",
-$         "messages": [
-$             {"role": "system", "content": "You are a helpful assistant."},
-$             {"role": "user", "content": "Who won the world series in 2020?"}
-$         ]
-$     }'
+curl http://localhost:8000/v1/chat/completions \
+    -H "Content-Type: application/json" \
+    -d '{
+        "model": "Qwen/Qwen2.5-1.5B-Instruct",
+        "messages": [
+            {"role": "system", "content": "You are a helpful assistant."},
+            {"role": "user", "content": "Who won the world series in 2020?"}
+        ]
+    }'
 ```
 
 Alternatively, you can use the `openai` Python package:
diff --git a/docs/source/getting_started/troubleshooting.md b/docs/source/getting_started/troubleshooting.md
index f5efe0bef7506..1e290d2b4c0bd 100644
--- a/docs/source/getting_started/troubleshooting.md
+++ b/docs/source/getting_started/troubleshooting.md
@@ -48,6 +48,7 @@ If vLLM crashes and the error trace captures it somewhere around `self.graph.rep
 To identify the particular CUDA operation that causes the error, you can add `--enforce-eager` to the command line, or `enforce_eager=True` to the {class}`~vllm.LLM` class to disable the CUDAGraph optimization and isolate the exact CUDA operation that causes the error.
 
 (troubleshooting-incorrect-hardware-driver)=
+
 ## Incorrect hardware/driver
 
 If GPU/CPU communication cannot be established, you can use the following Python script and follow the instructions below to confirm whether the GPU/CPU communication is working correctly.
@@ -118,13 +119,13 @@ dist.destroy_process_group()
 If you are testing with a single node, adjust `--nproc-per-node` to the number of GPUs you want to use:
 
 ```console
-$ NCCL_DEBUG=TRACE torchrun --nproc-per-node=<number-of-GPUs> test.py
+NCCL_DEBUG=TRACE torchrun --nproc-per-node=<number-of-GPUs> test.py
 ```
 
 If you are testing with multi-nodes, adjust `--nproc-per-node` and `--nnodes` according to your setup and set `MASTER_ADDR` to the correct IP address of the master node, reachable from all nodes. Then, run:
 
 ```console
-$ NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=$MASTER_ADDR test.py
+NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=$MASTER_ADDR test.py
 ```
 
 If the script runs successfully, you should see the message `sanity check is successful!`.
@@ -141,6 +142,7 @@ Adjust `--nproc-per-node`, `--nnodes`, and `--node-rank` according to your setup
 ```
 
 (troubleshooting-python-multiprocessing)=
+
 ## Python multiprocessing
 
 ### `RuntimeError` Exception
diff --git a/docs/source/index.md b/docs/source/index.md
index de74276a50fb6..8f9493d77186e 100644
--- a/docs/source/index.md
+++ b/docs/source/index.md
@@ -1,4 +1,4 @@
-# Welcome to vLLM!
+# Welcome to vLLM
 
 ```{figure} ./assets/logos/vllm-logo-text-light.png
 :align: center
@@ -186,7 +186,7 @@ community/meetups
 community/sponsors
 ```
 
-# Indices and tables
+## Indices and tables
 
 - {ref}`genindex`
 - {ref}`modindex`
diff --git a/docs/source/models/extensions/runai_model_streamer.md b/docs/source/models/extensions/runai_model_streamer.md
index fe2701194a604..75f7a9fcad416 100644
--- a/docs/source/models/extensions/runai_model_streamer.md
+++ b/docs/source/models/extensions/runai_model_streamer.md
@@ -9,25 +9,25 @@ vLLM supports loading weights in Safetensors format using the Run:ai Model Strea
 You first need to install vLLM RunAI optional dependency:
 
 ```console
-$ pip3 install vllm[runai]
+pip3 install vllm[runai]
 ```
 
 To run it as an OpenAI-compatible server, add the `--load-format runai_streamer` flag:
 
 ```console
-$ vllm serve /home/meta-llama/Llama-3.2-3B-Instruct --load-format runai_streamer
+vllm serve /home/meta-llama/Llama-3.2-3B-Instruct --load-format runai_streamer
 ```
 
 To run model from AWS S3 object store run:
 
 ```console
-$ vllm serve s3://core-llm/Llama-3-8b --load-format runai_streamer
+vllm serve s3://core-llm/Llama-3-8b --load-format runai_streamer
 ```
 
 To run model from a S3 compatible object store run:
 
 ```console
-$ RUNAI_STREAMER_S3_USE_VIRTUAL_ADDRESSING=0 AWS_EC2_METADATA_DISABLED=true AWS_ENDPOINT_URL=https://storage.googleapis.com vllm serve s3://core-llm/Llama-3-8b --load-format runai_streamer
+RUNAI_STREAMER_S3_USE_VIRTUAL_ADDRESSING=0 AWS_EC2_METADATA_DISABLED=true AWS_ENDPOINT_URL=https://storage.googleapis.com vllm serve s3://core-llm/Llama-3-8b --load-format runai_streamer
 ```
 
 ## Tunable parameters
@@ -38,14 +38,14 @@ You can tune `concurrency` that controls the level of concurrency and number of
 For reading from S3, it will be the number of client instances the host is opening to the S3 server.
 
 ```console
-$ vllm serve /home/meta-llama/Llama-3.2-3B-Instruct --load-format runai_streamer --model-loader-extra-config '{"concurrency":16}'
+vllm serve /home/meta-llama/Llama-3.2-3B-Instruct --load-format runai_streamer --model-loader-extra-config '{"concurrency":16}'
 ```
 
 You can control the size of the CPU Memory buffer to which tensors are read from the file, and limit this size.
 You can read further about CPU buffer memory limiting [here](https://github.com/run-ai/runai-model-streamer/blob/master/docs/src/env-vars.md#runai_streamer_memory_limit).
 
 ```console
-$ vllm serve /home/meta-llama/Llama-3.2-3B-Instruct --load-format runai_streamer --model-loader-extra-config '{"memory_limit":5368709120}'
+vllm serve /home/meta-llama/Llama-3.2-3B-Instruct --load-format runai_streamer --model-loader-extra-config '{"memory_limit":5368709120}'
 ```
 
 ```{note}
diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md
index 72910ea1e2d19..5c96dfdad25f7 100644
--- a/docs/source/models/supported_models.md
+++ b/docs/source/models/supported_models.md
@@ -45,7 +45,7 @@ Alternatively, you can [open an issue on GitHub](https://github.com/vllm-project
 To use models from [ModelScope](https://www.modelscope.cn) instead of HuggingFace Hub, set an environment variable:
 
 ```shell
-$ export VLLM_USE_MODELSCOPE=True
+export VLLM_USE_MODELSCOPE=True
 ```
 
 And use with `trust_remote_code=True`.
@@ -820,19 +820,22 @@ The following table lists those that are tested in vLLM.
 
 _________________
 
-# Model Support Policy
+## Model Support Policy
 
 At vLLM, we are committed to facilitating the integration and support of third-party models within our ecosystem. Our approach is designed to balance the need for robustness and the practical limitations of supporting a wide range of models. Here’s how we manage third-party model support:
 
 1. **Community-Driven Support**: We encourage community contributions for adding new models. When a user requests support for a new model, we welcome pull requests (PRs) from the community. These contributions are evaluated primarily on the sensibility of the output they generate, rather than strict consistency with existing implementations such as those in transformers. **Call for contribution:** PRs coming directly from model vendors are greatly appreciated!
+
 2. **Best-Effort Consistency**: While we aim to maintain a level of consistency between the models implemented in vLLM and other frameworks like transformers, complete alignment is not always feasible. Factors like acceleration techniques and the use of low-precision computations can introduce discrepancies. Our commitment is to ensure that the implemented models are functional and produce sensible results.
 
-```{tip}
-When comparing the output of `model.generate` from HuggingFace Transformers with the output of `llm.generate` from vLLM, note that the former reads the model's generation config file (i.e., [generation_config.json](https://github.com/huggingface/transformers/blob/19dabe96362803fb0a9ae7073d03533966598b17/src/transformers/generation/utils.py#L1945)) and applies the default parameters for generation, while the latter only uses the parameters passed to the function. Ensure all sampling parameters are identical when comparing outputs.
-```
+    ```{tip}
+    When comparing the output of `model.generate` from HuggingFace Transformers with the output of `llm.generate` from vLLM, note that the former reads the model's generation config file (i.e., [generation_config.json](https://github.com/huggingface/transformers/blob/19dabe96362803fb0a9ae7073d03533966598b17/src/transformers/generation/utils.py#L1945)) and applies the default parameters for generation, while the latter only uses the parameters passed to the function. Ensure all sampling parameters are identical when comparing outputs.
+    ```
 
 3. **Issue Resolution and Model Updates**: Users are encouraged to report any bugs or issues they encounter with third-party models. Proposed fixes should be submitted via PRs, with a clear explanation of the problem and the rationale behind the proposed solution. If a fix for one model impacts another, we rely on the community to highlight and address these cross-model dependencies. Note: for bugfix PRs, it is good etiquette to inform the original author to seek their feedback.
+
 4. **Monitoring and Updates**: Users interested in specific models should monitor the commit history for those models (e.g., by tracking changes in the main/vllm/model_executor/models directory). This proactive approach helps users stay informed about updates and changes that may affect the models they use.
+
 5. **Selective Focus**: Our resources are primarily directed towards models with significant user interest and impact. Models that are less frequently used may receive less attention, and we rely on the community to play a more active role in their upkeep and improvement.
 
 Through this approach, vLLM fosters a collaborative environment where both the core development team and the broader community contribute to the robustness and diversity of the third-party models supported in our ecosystem.
diff --git a/docs/source/performance/optimization.md b/docs/source/performance/optimization.md
index 4fcde9b03b887..4fbc376e1aa39 100644
--- a/docs/source/performance/optimization.md
+++ b/docs/source/performance/optimization.md
@@ -8,7 +8,7 @@ Due to the auto-regressive nature of transformer architecture, there are times w
 The vLLM can preempt requests to free up KV cache space for other requests. Preempted requests are recomputed when sufficient KV cache space becomes
 available again. When this occurs, the following warning is printed:
 
-```
+```text
 WARNING 05-09 00:49:33 scheduler.py:1057 Sequence group 0 is preempted by PreemptionMode.SWAP mode because there is not enough KV cache space. This can affect the end-to-end performance. Increase gpu_memory_utilization or tensor_parallel_size to provide more KV cache memory. total_cumulative_preemption_cnt=1
 ```
 
diff --git a/docs/source/serving/distributed_serving.md b/docs/source/serving/distributed_serving.md
index 4e0a9ef6ecf7d..daf6e2f250416 100644
--- a/docs/source/serving/distributed_serving.md
+++ b/docs/source/serving/distributed_serving.md
@@ -35,16 +35,16 @@ output = llm.generate("San Franciso is a")
 To run multi-GPU serving, pass in the `--tensor-parallel-size` argument when starting the server. For example, to run API server on 4 GPUs:
 
 ```console
-$ vllm serve facebook/opt-13b \
-$     --tensor-parallel-size 4
+vllm serve facebook/opt-13b \
+    --tensor-parallel-size 4
 ```
 
 You can also additionally specify `--pipeline-parallel-size` to enable pipeline parallelism. For example, to run API server on 8 GPUs with pipeline parallelism and tensor parallelism:
 
 ```console
-$ vllm serve gpt2 \
-$     --tensor-parallel-size 4 \
-$     --pipeline-parallel-size 2
+vllm serve gpt2 \
+    --tensor-parallel-size 4 \
+    --pipeline-parallel-size 2
 ```
 
 ## Running vLLM on multiple nodes
@@ -56,21 +56,21 @@ The first step, is to start containers and organize them into a cluster. We have
 Pick a node as the head node, and run the following command:
 
 ```console
-$ bash run_cluster.sh \
-$                   vllm/vllm-openai \
-$                   ip_of_head_node \
-$                   --head \
-$                   /path/to/the/huggingface/home/in/this/node
+bash run_cluster.sh \
+                vllm/vllm-openai \
+                ip_of_head_node \
+                --head \
+                /path/to/the/huggingface/home/in/this/node
 ```
 
 On the rest of the worker nodes, run the following command:
 
 ```console
-$ bash run_cluster.sh \
-$                   vllm/vllm-openai \
-$                   ip_of_head_node \
-$                   --worker \
-$                   /path/to/the/huggingface/home/in/this/node
+bash run_cluster.sh \
+                vllm/vllm-openai \
+                ip_of_head_node \
+                --worker \
+                /path/to/the/huggingface/home/in/this/node
 ```
 
 Then you get a ray cluster of containers. Note that you need to keep the shells running these commands alive to hold the cluster. Any shell disconnect will terminate the cluster. In addition, please note that the argument `ip_of_head_node` should be the IP address of the head node, which is accessible by all the worker nodes. A common misunderstanding is to use the IP address of the worker node, which is not correct.
@@ -80,16 +80,16 @@ Then, on any node, use `docker exec -it node /bin/bash` to enter the container,
 After that, on any node, you can use vLLM as usual, just as you have all the GPUs on one node. The common practice is to set the tensor parallel size to the number of GPUs in each node, and the pipeline parallel size to the number of nodes. For example, if you have 16 GPUs in 2 nodes (8 GPUs per node), you can set the tensor parallel size to 8 and the pipeline parallel size to 2:
 
 ```console
-$ vllm serve /path/to/the/model/in/the/container \
-$     --tensor-parallel-size 8 \
-$     --pipeline-parallel-size 2
+vllm serve /path/to/the/model/in/the/container \
+    --tensor-parallel-size 8 \
+    --pipeline-parallel-size 2
 ```
 
 You can also use tensor parallel without pipeline parallel, just set the tensor parallel size to the number of GPUs in the cluster. For example, if you have 16 GPUs in 2 nodes (8 GPUs per node), you can set the tensor parallel size to 16:
 
 ```console
-$ vllm serve /path/to/the/model/in/the/container \
-$     --tensor-parallel-size 16
+vllm serve /path/to/the/model/in/the/container \
+     --tensor-parallel-size 16
 ```
 
 To make tensor parallel performant, you should make sure the communication between nodes is efficient, e.g. using high-speed network cards like Infiniband. To correctly set up the cluster to use Infiniband, append additional arguments like `--privileged -e NCCL_IB_HCA=mlx5` to the `run_cluster.sh` script. Please contact your system administrator for more information on how to set up the flags. One way to confirm if the Infiniband is working is to run vLLM with `NCCL_DEBUG=TRACE` environment variable set, e.g. `NCCL_DEBUG=TRACE vllm serve ...` and check the logs for the NCCL version and the network used. If you find `[send] via NET/Socket` in the logs, it means NCCL uses raw TCP Socket, which is not efficient for cross-node tensor parallel. If you find `[send] via NET/IB/GDRDMA` in the logs, it means NCCL uses Infiniband with GPU-Direct RDMA, which is efficient.
diff --git a/docs/source/serving/integrations/langchain.md b/docs/source/serving/integrations/langchain.md
index 49ff6e0c32a72..03142d23b145a 100644
--- a/docs/source/serving/integrations/langchain.md
+++ b/docs/source/serving/integrations/langchain.md
@@ -7,7 +7,7 @@ vLLM is also available via [LangChain](https://github.com/langchain-ai/langchain
 To install LangChain, run
 
 ```console
-$ pip install langchain langchain_community -q
+pip install langchain langchain_community -q
 ```
 
 To run inference on a single or multiple GPUs, use `VLLM` class from `langchain`.
diff --git a/docs/source/serving/integrations/llamaindex.md b/docs/source/serving/integrations/llamaindex.md
index 9961c181d7e1c..8c72605202cf5 100644
--- a/docs/source/serving/integrations/llamaindex.md
+++ b/docs/source/serving/integrations/llamaindex.md
@@ -7,7 +7,7 @@ vLLM is also available via [LlamaIndex](https://github.com/run-llama/llama_index
 To install LlamaIndex, run
 
 ```console
-$ pip install llama-index-llms-vllm -q
+pip install llama-index-llms-vllm -q
 ```
 
 To run inference on a single or multiple GPUs, use `Vllm` class from `llamaindex`.
diff --git a/docs/source/serving/metrics.md b/docs/source/serving/metrics.md
index e6ded2e6dd465..6c84f6d1350a6 100644
--- a/docs/source/serving/metrics.md
+++ b/docs/source/serving/metrics.md
@@ -7,7 +7,7 @@ OpenAI compatible API server.
 You can start the server using Python, or using [Docker](#deployment-docker):
 
 ```console
-$ vllm serve unsloth/Llama-3.2-1B-Instruct
+vllm serve unsloth/Llama-3.2-1B-Instruct
 ```
 
 Then query the endpoint to get the latest metrics from the server:
diff --git a/docs/source/serving/multimodal_inputs.md b/docs/source/serving/multimodal_inputs.md
index 53f5a274e39a3..0213b0a3388ea 100644
--- a/docs/source/serving/multimodal_inputs.md
+++ b/docs/source/serving/multimodal_inputs.md
@@ -303,6 +303,7 @@ vllm serve llava-hf/llava-onevision-qwen2-0.5b-ov-hf --task generate --max-model
 ```
 
 Then, you can use the OpenAI client as follows:
+
 ```python
 from openai import OpenAI
 
diff --git a/docs/source/serving/offline_inference.md b/docs/source/serving/offline_inference.md
index 79092ab208784..94703a1c32ade 100644
--- a/docs/source/serving/offline_inference.md
+++ b/docs/source/serving/offline_inference.md
@@ -64,7 +64,7 @@ Dynamic quantization is also supported via the `quantization` option -- see [her
 
 #### Context length and batch size
 
-You can further reduce memory usage by limit the context length of the model (`max_model_len` option)
+You can further reduce memory usage by limiting the context length of the model (`max_model_len` option)
 and the maximum batch size (`max_num_seqs` option).
 
 ```python
diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md
index ec5a367594743..e49bbb06695f8 100644
--- a/docs/source/serving/openai_compatible_server.md
+++ b/docs/source/serving/openai_compatible_server.md
@@ -5,11 +5,13 @@
 vLLM provides an HTTP server that implements OpenAI's [Completions API](https://platform.openai.com/docs/api-reference/completions), [Chat API](https://platform.openai.com/docs/api-reference/chat), and more!
 
 You can start the server via the [`vllm serve`](#vllm-serve) command, or through [Docker](#deployment-docker):
+
 ```bash
 vllm serve NousResearch/Meta-Llama-3-8B-Instruct --dtype auto --api-key token-abc123
 ```
 
 To call the server, you can use the [official OpenAI Python client](https://github.com/openai/openai-python), or any other HTTP client.
+
 ```python
 from openai import OpenAI
 client = OpenAI(
@@ -50,6 +52,7 @@ In addition, we have the following custom APIs:
   - Only applicable to [cross-encoder models](../models/pooling_models.md) (`--task score`).
 
 (chat-template)=
+
 ## Chat Template
 
 In order for the language model to support chat protocol, vLLM requires the model to include
@@ -71,6 +74,7 @@ vLLM community provides a set of chat templates for popular models. You can find
 
 With the inclusion of multi-modal chat APIs, the OpenAI spec now accepts chat messages in a new format which specifies
 both a `type` and a `text` field. An example is provided below:
+
 ```python
 completion = client.chat.completions.create(
   model="NousResearch/Meta-Llama-3-8B-Instruct",
@@ -80,7 +84,7 @@ completion = client.chat.completions.create(
 )
 ```
 
-Most chat templates for LLMs expect the `content` field to be a string, but there are some newer models like 
+Most chat templates for LLMs expect the `content` field to be a string, but there are some newer models like
 `meta-llama/Llama-Guard-3-1B` that expect the content to be formatted according to the OpenAI schema in the
 request. vLLM provides best-effort support to detect this automatically, which is logged as a string like
 *"Detected the chat template content format to be..."*, and internally converts incoming requests to match
@@ -115,12 +119,12 @@ completion = client.chat.completions.create(
 ## Extra HTTP Headers
 
 Only `X-Request-Id` HTTP request header is supported for now. It can be enabled
-with `--enable-request-id-headers`. 
+with `--enable-request-id-headers`.
 
 > Note that enablement of the headers can impact performance significantly at high QPS
 > rates. We recommend implementing HTTP headers at the router level (e.g. via Istio),
 > rather than within the vLLM layer for this reason.
-> See https://github.com/vllm-project/vllm/pull/11529 for more details.
+> See [this PR](https://github.com/vllm-project/vllm/pull/11529) for more details.
 
 ```python
 completion = client.chat.completions.create(
@@ -147,6 +151,7 @@ print(completion._request_id)
 ## CLI Reference
 
 (vllm-serve)=
+
 ### `vllm serve`
 
 The `vllm serve` command is used to launch the OpenAI-compatible server.
@@ -175,7 +180,7 @@ uvicorn-log-level: "info"
 To use the above config file:
 
 ```bash
-$ vllm serve SOME_MODEL --config config.yaml
+vllm serve SOME_MODEL --config config.yaml
 ```
 
 ```{note}
@@ -186,6 +191,7 @@ The order of priorities is `command line > config file values > defaults`.
 ## API Reference
 
 (completions-api)=
+
 ### Completions API
 
 Our Completions API is compatible with [OpenAI's Completions API](https://platform.openai.com/docs/api-reference/completions);
@@ -212,6 +218,7 @@ The following extra parameters are supported:
 ```
 
 (chat-api)=
+
 ### Chat API
 
 Our Chat API is compatible with [OpenAI's Chat Completions API](https://platform.openai.com/docs/api-reference/chat);
@@ -243,6 +250,7 @@ The following extra parameters are supported:
 ```
 
 (embeddings-api)=
+
 ### Embeddings API
 
 Our Embeddings API is compatible with [OpenAI's Embeddings API](https://platform.openai.com/docs/api-reference/embeddings);
@@ -284,6 +292,7 @@ For chat-like input (i.e. if `messages` is passed), these extra parameters are s
 ```
 
 (tokenizer-api)=
+
 ### Tokenizer API
 
 Our Tokenizer API is a simple wrapper over [HuggingFace-style tokenizers](https://huggingface.co/docs/transformers/en/main_classes/tokenizer).
@@ -293,6 +302,7 @@ It consists of two endpoints:
 - `/detokenize` corresponds to calling `tokenizer.decode()`.
 
 (pooling-api)=
+
 ### Pooling API
 
 Our Pooling API encodes input prompts using a [pooling model](../models/pooling_models.md) and returns the corresponding hidden states.
@@ -302,6 +312,7 @@ The input format is the same as [Embeddings API](#embeddings-api), but the outpu
 Code example: <gh-file:examples/online_serving/openai_pooling_client.py>
 
 (score-api)=
+
 ### Score API
 
 Our Score API applies a cross-encoder model to predict scores for sentence pairs.
diff --git a/format.sh b/format.sh
index 0b196de9d0773..2277eef93c745 100755
--- a/format.sh
+++ b/format.sh
@@ -41,7 +41,7 @@ MYPY_VERSION=$(mypy --version | awk '{print $2}')
 CODESPELL_VERSION=$(codespell --version)
 ISORT_VERSION=$(isort --vn)
 CLANGFORMAT_VERSION=$(clang-format --version | awk '{print $3}')
-SPHINX_LINT_VERSION=$(sphinx-lint --version | awk '{print $2}')
+PYMARKDOWNLNT_VERSION=$(pymarkdownlnt version | awk '{print $1}')
 
 # # params: tool name, tool version, required version
 tool_version_check() {
@@ -58,7 +58,7 @@ tool_version_check "mypy" "$MYPY_VERSION"
 tool_version_check "isort" "$ISORT_VERSION"
 tool_version_check "codespell" "$CODESPELL_VERSION"
 tool_version_check "clang-format" "$CLANGFORMAT_VERSION"
-tool_version_check "sphinx-lint" "$SPHINX_LINT_VERSION"
+tool_version_check "pymarkdownlnt" "$PYMARKDOWNLNT_VERSION"
 
 YAPF_FLAGS=(
     '--recursive'
@@ -316,6 +316,6 @@ else
     echo "✨🎉 Format check passed! Congratulations! 🎉✨"
 fi
 
-echo 'vLLM sphinx-lint:'
-tools/sphinx-lint.sh
-echo 'vLLM sphinx-lint: Done'
+echo 'vLLM doc-lint:'
+tools/doc-lint.sh
+echo 'vLLM doc-lint: Done'
diff --git a/pyproject.toml b/pyproject.toml
index 0ac3f39ef7a5f..82275ccafb572 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -101,3 +101,9 @@ markers = [
     "skip_v1: do not run this test with v1",
     "optional: optional tests that are automatically skipped, include --optional to run them",
 ]
+
+[tool.pymarkdown]
+plugins.md013.enabled = false # line-length
+plugins.md041.enabled = false # first-line-h1
+plugins.md033.enabled = false # inline-html
+plugins.md024.allow_different_nesting = true # no-duplicate-headers
diff --git a/requirements-lint.txt b/requirements-lint.txt
index 711bb50a0e936..ffc73f90a0d48 100644
--- a/requirements-lint.txt
+++ b/requirements-lint.txt
@@ -6,7 +6,7 @@ ruff==0.6.5
 codespell==2.3.0
 isort==5.13.2
 clang-format==18.1.5
-sphinx-lint==1.0.0
+pymarkdownlnt==0.9.26
 
 # type checking
 mypy==1.11.1
diff --git a/tools/doc-lint.sh b/tools/doc-lint.sh
new file mode 100755
index 0000000000000..19a55ddfa91c4
--- /dev/null
+++ b/tools/doc-lint.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+pymarkdownlnt scan docs -r
diff --git a/tools/sphinx-lint.sh b/tools/sphinx-lint.sh
deleted file mode 100755
index 04f8075c5527f..0000000000000
--- a/tools/sphinx-lint.sh
+++ /dev/null
@@ -1,3 +0,0 @@
-#!/bin/bash
-
-sphinx-lint --disable trailing-whitespace,missing-final-newline docs

From f967e51f386404c7ead21d3c59ddc195cf946975 Mon Sep 17 00:00:00 2001
From: Isotr0py <mozf@mail2.sysu.edu.cn>
Date: Sun, 12 Jan 2025 16:17:24 +0800
Subject: [PATCH 092/115] [Model] Initialize support for Deepseek-VL2 models
 (#11578)

Signed-off-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
---
 .buildkite/test-pipeline.yaml                 |   1 +
 docs/source/models/supported_models.md        |  20 +-
 examples/offline_inference/vision_language.py |  18 +
 .../vision_language_multi_image.py            |  23 +
 .../vision_language/test_models.py            |  27 +
 .../vision_language/vlm_utils/model_utils.py  |  36 +
 tests/models/registry.py                      |   2 +
 tests/models/test_initialization.py           |   3 +
 vllm/entrypoints/chat_utils.py                |   4 +-
 vllm/model_executor/models/deepseek_v2.py     |  18 +-
 vllm/model_executor/models/deepseek_v3.py     |  20 +-
 vllm/model_executor/models/deepseek_vl2.py    | 662 ++++++++++++++++++
 vllm/model_executor/models/minicpmv.py        |   2 +-
 vllm/model_executor/models/registry.py        |   1 +
 vllm/transformers_utils/config.py             |   6 +-
 vllm/transformers_utils/configs/__init__.py   |   2 +
 .../configs/deepseek_vl2.py                   | 214 ++++++
 17 files changed, 1050 insertions(+), 9 deletions(-)
 create mode 100644 vllm/model_executor/models/deepseek_vl2.py
 create mode 100644 vllm/transformers_utils/configs/deepseek_vl2.py

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index cf82210f96ee3..393912881bca3 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -52,6 +52,7 @@ steps:
   - tests/worker
   - tests/standalone_tests/lazy_torch_compile.py
   commands:
+  - pip install git+https://github.com/Isotr0py/DeepSeek-VL2.git  # Used by multimodal processing test
   - python3 standalone_tests/lazy_torch_compile.py
   - pytest -v -s mq_llm_engine # MQLLMEngine
   - pytest -v -s async_engine # AsyncLLMEngine
diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md
index 5c96dfdad25f7..642ef3c9655b8 100644
--- a/docs/source/models/supported_models.md
+++ b/docs/source/models/supported_models.md
@@ -610,6 +610,13 @@ See [this page](#generative-models) for more information on how to use generativ
   -
   - ✅︎
   - ✅︎
+* - `DeepseekVLV2ForCausalLM`
+  - DeepSeek-VL2
+  - T + I<sup>+</sup>
+  - `deepseek-ai/deepseek-vl2-tiny` (WIP), `deepseek-ai/deepseek-vl2-small`, `deepseek-ai/deepseek-vl2` etc. (see note)
+  -
+  - ✅︎
+  - ✅︎
 * - `FuyuForCausalLM`
   - Fuyu
   - T + I
@@ -755,8 +762,19 @@ See [this page](#generative-models) for more information on how to use generativ
 <sup>E</sup> Pre-computed embeddings can be inputted for this modality.  
 <sup>+</sup> Multiple items can be inputted per text prompt for this modality.
 
+````{note}
+The `deepseek-ai/deepseek-vl2-tiny` model is not supported yet.
+
+To use `DeepSeek-VL2` series models, you need to install a forked version of the `deepseek_vl2` package:
+```shell
+pip install git+https://github.com/Isotr0py/DeepSeek-VL2.git
+```
+
+In addition, to run `DeepSeek-VL2` series models, you have to pass `--hf_overrides '{"architectures": ["DeepseekVLV2ForCausalLM"]}'` when running vLLM.
+````
+
 ```{note}
-To use `TIGER-Lab/Mantis-8B-siglip-llama3`, you have pass `--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'` when running vLLM.
+To use `TIGER-Lab/Mantis-8B-siglip-llama3`, you have to pass `--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'` when running vLLM.
 ```
 
 ```{note}
diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py
index b51bfae455267..ad32b9fe242e9 100644
--- a/examples/offline_inference/vision_language.py
+++ b/examples/offline_inference/vision_language.py
@@ -66,6 +66,23 @@ def run_chameleon(question: str, modality: str):
     return llm, prompt, stop_token_ids
 
 
+# Deepseek-VL2
+def run_deepseek_vl2(question: str, modality: str):
+    assert modality == "image"
+
+    model_name = "deepseek-ai/deepseek-vl2-small"
+
+    llm = LLM(model=model_name,
+              max_model_len=4096,
+              max_num_seqs=2,
+              disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+              hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]})
+
+    prompt = f"<|User|>: <image>\n{question}\n\n<|Assistant|>:"
+    stop_token_ids = None
+    return llm, prompt, stop_token_ids
+
+
 # Fuyu
 def run_fuyu(question: str, modality: str):
     assert modality == "image"
@@ -498,6 +515,7 @@ def run_qwen2_vl(question: str, modality: str):
     "aria": run_aria,
     "blip-2": run_blip2,
     "chameleon": run_chameleon,
+    "deepseek_vl_v2": run_deepseek_vl2,
     "fuyu": run_fuyu,
     "glm4v": run_glm4v,
     "h2ovl_chat": run_h2ovl,
diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py
index cf2e90a325c6a..c6cf3f30c31cb 100644
--- a/examples/offline_inference/vision_language_multi_image.py
+++ b/examples/offline_inference/vision_language_multi_image.py
@@ -54,6 +54,28 @@ def load_aria(question, image_urls: List[str]) -> ModelRequestData:
     )
 
 
+def load_deepseek_vl2(question: str, image_urls: List[str]):
+    model_name = "deepseek-ai/deepseek-vl2-small"
+
+    llm = LLM(model=model_name,
+              max_model_len=4096,
+              max_num_seqs=2,
+              hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]},
+              limit_mm_per_prompt={"image": len(image_urls)})
+
+    placeholder = "".join(f"image_{i}:<image>\n"
+                          for i, _ in enumerate(image_urls, start=1))
+    prompt = f"<|User|>: {placeholder}{question}\n\n<|Assistant|>:"
+
+    return ModelRequestData(
+        llm=llm,
+        prompt=prompt,
+        stop_token_ids=None,
+        image_data=[fetch_image(url) for url in image_urls],
+        chat_template=None,
+    )
+
+
 def load_h2onvl(question: str, image_urls: List[str]) -> ModelRequestData:
     model_name = "h2oai/h2ovl-mississippi-2b"
 
@@ -372,6 +394,7 @@ def load_qwen2_vl(question, image_urls: List[str]) -> ModelRequestData:
 
 model_example_map = {
     "aria": load_aria,
+    "deepseek_vl2": load_deepseek_vl2,
     "h2ovl_chat": load_h2onvl,
     "idefics3": load_idefics3,
     "internvl_chat": load_internvl,
diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py
index 146685738a1d0..7620ed1107e8f 100644
--- a/tests/models/decoder_only/vision_language/test_models.py
+++ b/tests/models/decoder_only/vision_language/test_models.py
@@ -188,6 +188,33 @@
         max_tokens=8,
         dtype="bfloat16",
     ),
+    "deepseek_vl_v2": VLMTestInfo(
+        models=["deepseek-ai/deepseek-vl2-small"],
+        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
+        dtype="bfloat16",
+        prompt_formatter=lambda img_prompt: f"<|User|>: {img_prompt}\n\n<|Assistant|>: ", # noqa: E501
+        max_model_len=4096,
+        max_num_seqs=2,
+        single_image_prompts=IMAGE_ASSETS.prompts({
+            "stop_sign": "<image>\nWhat's the color of the stop sign and car?",
+            "cherry_blossom": "<image>\nWhat's the color of the tower?",
+        }),
+        multi_image_prompt="image_1:<image>\nimage_2:<image>\nDescribe the two images shortly.",    # noqa: E501
+        vllm_runner_kwargs={"hf_overrides": {"architectures": ["DeepseekVLV2ForCausalLM"]}},  # noqa: E501
+        image_size_factors=[(0.10, 0.15)],
+        patch_hf_runner=model_utils.deepseekvl2_patch_hf_runner,
+        postprocess_inputs=model_utils.cast_dtype_post_processor("images"),
+        hf_output_post_proc=model_utils.deepseekvl2_trunc_hf_output,
+        stop_str=["<｜end▁of▁sentence｜>", "<｜begin▁of▁sentence｜>"],  # noqa: E501
+        num_logprobs=5,
+        marks=[
+            pytest.mark.skipif(
+                not is_flash_attn_2_available(),
+                reason="Model needs flash-attn for numeric convergence.",
+            ),
+            large_gpu_mark(min_gb=48),
+        ],
+    ),
     "fuyu": VLMTestInfo(
         models=["adept/fuyu-8b"],
         test_type=VLMTestType.IMAGE,
diff --git a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py
index 6c7a753af787e..1ca85c7bb2056 100644
--- a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py
+++ b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py
@@ -183,6 +183,14 @@ def paligemma_vllm_to_hf_output(vllm_output: RunnerOutput,
 
 
 ####### Post-processors for HF outputs
+def deepseekvl2_trunc_hf_output(hf_output: RunnerOutput,
+                                model: str) -> RunnerOutput:
+    output_ids, output_str, out_logprobs = hf_output
+    if output_str.endswith("<｜end▁of▁sentence｜>"):
+        output_str = output_str.split("<｜end▁of▁sentence｜>")[0]
+    return output_ids, output_str, out_logprobs
+
+
 def minicpmv_trunc_hf_output(hf_output: RunnerOutput,
                              model: str) -> RunnerOutput:
     output_ids, output_str, out_logprobs = hf_output
@@ -261,6 +269,34 @@ def qwen_prompt_path_encoder(
 
 
 ####### Model-specific HuggingFace runner patchers
+def deepseekvl2_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
+    """Patches and returns an HfRunner instance to use for DeepSeek-VL2."""
+    hf_processor = hf_model.processor
+
+    def processor(*args, text="", images=None, **kwargs):
+        if isinstance(images, Image):
+            images = [images]
+        # inputs is a custom class instead of dict or BatchFeature
+        inputs = hf_processor(
+            *args,
+            prompt=text,
+            images=images,
+            **kwargs,
+        )
+        inputs = {
+            k: inputs[k]
+            for k in inputs.keys()  # noqa
+            if k not in ("seq_lens", "sft_format")
+        }
+        inputs = BatchEncoding(data=inputs, tensor_type="pt")
+        return inputs
+
+    hf_model.processor = processor
+    hf_model.model.get_output_embeddings = lambda: \
+        hf_model.model.language.model.embed_tokens
+    return hf_model
+
+
 def glm_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
     """Patches and returns an instance of the HfRunner to use for GLM4."""
     hf_processor = hf_model.processor
diff --git a/tests/models/registry.py b/tests/models/registry.py
index f5aaa8eb071f9..d079725b2f78d 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -179,6 +179,8 @@ class _HfExamplesInfo:
                                     trust_remote_code=True),
     "ChatGLMForConditionalGeneration": _HfExamplesInfo("chatglm2-6b",
                                                        is_available_online=False),
+    # TODO(Isotr0py): Use deepseek-vl2-tiny for test after it's supported
+    "DeepseekVLV2ForCausalLM": _HfExamplesInfo("deepseek-ai/deepseek-vl2-small"),   # noqa: E501
     "FuyuForCausalLM": _HfExamplesInfo("adept/fuyu-8b"),
     "H2OVLChatModel": _HfExamplesInfo("h2oai/h2ovl-mississippi-800m"),
     "InternVLChatModel": _HfExamplesInfo("OpenGVLab/InternVL2-1B",
diff --git a/tests/models/test_initialization.py b/tests/models/test_initialization.py
index 7a564c1f4a1d0..daece7c93c0ef 100644
--- a/tests/models/test_initialization.py
+++ b/tests/models/test_initialization.py
@@ -26,6 +26,9 @@ def test_can_initialize(model_arch):
 
     # Avoid OOM
     def hf_overrides(hf_config: PretrainedConfig) -> PretrainedConfig:
+        if hf_config.model_type == "deepseek_vl_v2":
+            hf_config.update({"architectures": ["DeepseekVLV2ForCausalLM"]})
+
         if hasattr(hf_config, "text_config"):
             text_config: PretrainedConfig = hf_config.text_config
         else:
diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py
index 923c7459f6948..beedf5d16ab86 100644
--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -403,8 +403,8 @@ def _placeholder_str(self, modality: ModalityStr,
             if model_type.startswith("llava"):
                 return self._cached_token_str(self._tokenizer,
                                               hf_config.image_token_index)
-            if model_type in ("chameleon", "internvl_chat", "NVLM_D",
-                              "h2ovl_chat"):
+            if model_type in ("chameleon", "deepseek_vl_v2", "internvl_chat",
+                              "NVLM_D", "h2ovl_chat"):
                 return "<image>"
             if model_type == "mllama":
                 return "<|image|>"
diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py
index 4cf4e6c358bf2..9132040545863 100644
--- a/vllm/model_executor/models/deepseek_v2.py
+++ b/vllm/model_executor/models/deepseek_v2.py
@@ -243,7 +243,11 @@ def __init__(
                                         bias=False,
                                         quant_config=quant_config,
                                         prefix=f"{prefix}.o_proj")
-        rope_scaling["rope_type"] = 'deepseek_yarn'
+        if rope_scaling:
+            rope_scaling["rope_type"] = 'deepseek_yarn'
+            self.use_normal_rope = False
+        else:
+            self.use_normal_rope = True
         self.rotary_emb = get_rope(qk_rope_head_dim,
                                    rotary_dim=qk_rope_head_dim,
                                    max_position=max_position_embeddings,
@@ -298,7 +302,18 @@ def forward(
                      self.qk_nope_head_dim + self.v_head_dim)
         k_nope, v = kv.split([self.qk_nope_head_dim, self.v_head_dim], dim=-1)
         k_pe = latent_cache[:, :, self.kv_lora_rank:]
+
+        if self.use_normal_rope:
+            seq_len = positions.size(0)
+            ori_q_pe_shape, ori_k_pe_shape = q_pe.shape, k_pe.shape
+            q_pe = q_pe.reshape(seq_len, -1)
+            k_pe = k_pe.reshape(seq_len, -1)
+
         q_pe, k_pe = self.rotary_emb(positions, q_pe, k_pe)
+
+        if self.use_normal_rope:
+            q_pe, k_pe = q_pe.view(ori_q_pe_shape), k_pe.view(ori_k_pe_shape)
+
         q[..., self.qk_nope_head_dim:] = q_pe
         k = torch.empty_like(q)
         k[..., :self.qk_nope_head_dim] = k_nope
@@ -355,6 +370,7 @@ def __init__(
             quant_config=quant_config,
             prefix=f"{prefix}.self_attn",
         )
+
         if (config.n_routed_experts is not None
                 and layer_idx >= config.first_k_dense_replace
                 and layer_idx % config.moe_layer_freq == 0):
diff --git a/vllm/model_executor/models/deepseek_v3.py b/vllm/model_executor/models/deepseek_v3.py
index d4710622681b5..ca79b14c55fea 100644
--- a/vllm/model_executor/models/deepseek_v3.py
+++ b/vllm/model_executor/models/deepseek_v3.py
@@ -251,7 +251,11 @@ def __init__(
                                         bias=False,
                                         quant_config=quant_config,
                                         prefix=f"{prefix}.o_proj")
-        rope_scaling["rope_type"] = 'deepseek_yarn'
+        if rope_scaling:
+            rope_scaling["rope_type"] = 'deepseek_yarn'
+            self.use_normal_rope = False
+        else:
+            self.use_normal_rope = True
         self.rotary_emb = get_rope(qk_rope_head_dim,
                                    rotary_dim=qk_rope_head_dim,
                                    max_position=max_position_embeddings,
@@ -306,7 +310,18 @@ def forward(
                      self.qk_nope_head_dim + self.v_head_dim)
         k_nope, v = kv.split([self.qk_nope_head_dim, self.v_head_dim], dim=-1)
         k_pe = latent_cache[:, :, self.kv_lora_rank:]
+
+        if self.use_normal_rope:
+            seq_len = positions.size(0)
+            ori_q_pe_shape, ori_k_pe_shape = q_pe.shape, k_pe.shape
+            q_pe = q_pe.reshape(seq_len, -1)
+            k_pe = k_pe.reshape(seq_len, -1)
+
         q_pe, k_pe = self.rotary_emb(positions, q_pe, k_pe)
+
+        if self.use_normal_rope:
+            q_pe, k_pe = q_pe.view(ori_q_pe_shape), k_pe.view(ori_k_pe_shape)
+
         q[..., self.qk_nope_head_dim:] = q_pe
         k = torch.empty_like(q)
         k[..., :self.qk_nope_head_dim] = k_nope
@@ -583,7 +598,8 @@ def load_weights(self, weights: Iterable[Tuple[str,
                 continue
 
             # TODO(simon): support nextn predict layers
-            if self.config.num_nextn_predict_layers > 0:
+            if hasattr(self.config, "num_nextn_predict_layers"
+                       ) and self.config.num_nextn_predict_layers > 0:
                 assert self.config.num_nextn_predict_layers == 1
                 layer_idx = self.config.num_hidden_layers
                 if name.startswith(f"model.layers.{layer_idx}"):
diff --git a/vllm/model_executor/models/deepseek_vl2.py b/vllm/model_executor/models/deepseek_vl2.py
new file mode 100644
index 0000000000000..99fa941c055d2
--- /dev/null
+++ b/vllm/model_executor/models/deepseek_vl2.py
@@ -0,0 +1,662 @@
+# adapted from https://github.com/deepseek-ai/DeepSeek-VL2/blob/faf18023f24b962b32d9f0a2d89e402a8d383a78/deepseek_vl2/models/modeling_deepseek_vl_v2.py
+"""Inference-only Deepseek-VL2 model compatible with HuggingFace weights."""
+import math
+from functools import cached_property, partial
+from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple,
+                    TypedDict, Union)
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from einops import rearrange, repeat
+from transformers import AutoProcessor, BatchFeature, ProcessorMixin
+
+from vllm.attention import AttentionMetadata
+from vllm.config import VllmConfig
+from vllm.logger import init_logger
+from vllm.model_executor import SamplingMetadata
+from vllm.model_executor.layers.quantization import QuantizationConfig
+from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
+from vllm.model_executor.model_loader.utils import set_default_torch_dtype
+from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs,
+                                    NestedTensors)
+from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
+                                   ImageSize, MultiModalDataItems)
+from vllm.multimodal.processing import (BaseMultiModalProcessor,
+                                        BaseProcessingInfo, PromptReplacement)
+from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
+from vllm.multimodal.utils import cached_get_tokenizer
+from vllm.sequence import IntermediateTensors
+from vllm.transformers_utils.configs.deepseek_vl2 import (DeepseekVLV2Config,
+                                                          MlpProjectorConfig,
+                                                          VisionEncoderConfig)
+from vllm.utils import is_list_of
+
+from .interfaces import SupportsMultiModal, SupportsPP
+from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn,
+                    init_vllm_registered_model, maybe_prefix,
+                    merge_multimodal_embeddings)
+
+logger = init_logger(__name__)
+
+# The image token id may vary
+_IMAGE_TOKEN = "<image>"
+
+
+class DeepseekVL2ImagePixelInputs(TypedDict):
+    type: Literal["pixel_values"]
+    data: Union[torch.Tensor, List[torch.Tensor]]
+    """
+    Shape: `(batch_size * num_images, num_channels, height, width)`
+    """
+    images_spatial_crop: torch.Tensor
+    """
+    Shape: `(batch_size * num_images, 2)`
+    """
+
+
+class DeepseekVL2VImageEmbeddingInputs(TypedDict):
+    type: Literal["image_embeds"]
+    data: Union[torch.Tensor, List[torch.Tensor]]
+    """Shape: `(batch_size * num_images, image_feature_size, hidden_size)`
+
+    `hidden_size` must match the hidden size of language model backbone.
+    """
+
+
+DeepseekVL2ImageInputs = Union[DeepseekVL2ImagePixelInputs,
+                               DeepseekVL2VImageEmbeddingInputs]
+
+
+class MlpProjector(nn.Module):
+
+    def __init__(self, cfg: MlpProjectorConfig):
+
+        super().__init__()
+
+        self.cfg = cfg
+        assert not cfg.token_pooling, (
+            "Token pooling is not supported currently.")
+
+        if cfg.projector_type == "downsample_mlp_gelu":
+            mlp_depth = cfg.depth
+            mlp_ratio = cfg.mlp_ratio
+            modules = [
+                nn.Linear(
+                    cfg.input_dim * cfg.downsample_ratio *
+                    cfg.downsample_ratio, cfg.n_embed * mlp_ratio)
+            ]
+            for _ in range(1, mlp_depth - 1):
+                modules.append(nn.GELU())
+                modules.append(
+                    nn.Linear(cfg.n_embed * mlp_ratio,
+                              cfg.n_embed * mlp_ratio))
+            modules.append(nn.GELU())
+            modules.append(nn.Linear(cfg.n_embed * mlp_ratio, cfg.n_embed))
+            modules = nn.Sequential(*modules)
+
+        else:
+            raise NotImplementedError(
+                f"Unsupported projector type: {cfg.projector_type}")
+
+        self.layers = modules
+
+    def forward(self, x):
+        bs, hw, input_dim = x.shape
+        h = w = int((hw)**0.5)
+        """compute padding"""
+        if h % self.cfg.downsample_ratio:
+            pad = self.cfg.downsample_ratio - h % self.cfg.downsample_ratio
+        else:
+            pad = 0
+        x = x.reshape(bs, h, w, input_dim)
+        if pad > 0:
+            x = F.pad(x, (0, 0, 0, pad, 0, pad), "constant", 0)
+        """4 to 1 concat"""
+        x = x.permute(0, 3, 1, 2)  # B, C, H, W
+        x = F.unfold(x,
+                     kernel_size=self.cfg.downsample_ratio,
+                     stride=self.cfg.downsample_ratio,
+                     padding=0)  # B, C*4, HW // 4
+        x = x.permute(0, 2, 1)
+
+        return self.layers(x)
+
+
+class DeepseekVL2ProcessingInfo(BaseProcessingInfo):
+
+    def get_hf_config(self):
+        return self.ctx.get_hf_config(DeepseekVLV2Config)
+
+    def get_hf_processor(self) -> ProcessorMixin:
+        # TODO(Isotr0py): we should get rid of the dependency on deepseek_vl2
+        # in the future, because it's flaky and lacks maintenance.
+        try:
+            from deepseek_vl2.models.processing_deepseek_vl_v2 import (
+                DeepseekVLV2Processor, select_best_resolution)
+            AutoProcessor.register("DeepseekVLV2Processor",
+                                   DeepseekVLV2Processor)
+        except ModuleNotFoundError as exc:
+            raise ModuleNotFoundError(
+                "You need to `pip install "
+                "git+https://github.com/Isotr0py/DeepSeek-VL2.git` "
+                "to use this model") from exc
+
+        processor = self.ctx.get_hf_processor(DeepseekVLV2Processor)
+        processor.select_best_resolution = partial(
+            select_best_resolution,
+            candidate_resolutions=processor.candidate_resolutions)
+        return processor
+
+    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+        return {"image": None}
+
+    def get_num_image_tokens(self, *, image_width: int,
+                             image_height: int) -> int:
+        hf_processor = self.get_hf_processor()
+        image_size = hf_processor.image_size
+        patch_size = hf_processor.patch_size
+        downsample_ratio = hf_processor.downsample_ratio
+
+        best_width, best_height = hf_processor.select_best_resolution(
+            (image_width, image_height))
+
+        num_width_tiles, num_height_tiles = (best_width // image_size,
+                                             best_height // image_size)
+        h = w = math.ceil((image_size // patch_size) / downsample_ratio)
+
+        global_views_tokens = h * (w + 1)
+        local_views_tokens = (num_height_tiles * h) * (num_width_tiles * w + 1)
+        return global_views_tokens + local_views_tokens + 1
+
+    def get_image_size_with_most_features(self) -> ImageSize:
+        hf_config = self.get_hf_config()
+        candidate_resolutions = hf_config.candidate_resolutions
+        height, width = max(candidate_resolutions,
+                            key=lambda x: self.get_num_image_tokens(
+                                image_width=x[1], image_height=x[0]))
+        return ImageSize(width=width, height=height)
+
+    def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
+        max_image_size = self.get_image_size_with_most_features()
+        max_image_tokens = self.get_num_image_tokens(
+            image_height=max_image_size.height,
+            image_width=max_image_size.width)
+
+        return {"image": max_image_tokens}
+
+
+class DeepseekVL2DummyInputsBuilder(
+        BaseDummyInputsBuilder[DeepseekVL2ProcessingInfo]):
+
+    def get_dummy_processor_inputs(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+    ) -> ProcessorInputs:
+        num_images = mm_counts.get("image", 0)
+        hf_processor = self.info.get_hf_processor()
+        image_token: str = hf_processor.image_token
+
+        max_image_size = self.info.get_image_size_with_most_features()
+
+        mm_data = {
+            "image":
+            self._get_dummy_images(width=max_image_size.width,
+                                   height=max_image_size.height,
+                                   num_images=num_images)
+        }
+
+        return ProcessorInputs(
+            prompt_text=image_token * num_images,
+            mm_data=mm_data,
+        )
+
+
+class DeepseekVL2MultiModalProcessor(
+        BaseMultiModalProcessor[DeepseekVL2ProcessingInfo]):
+
+    def _call_hf_processor(
+        self,
+        prompt: str,
+        mm_data: Mapping[str, object],
+        mm_kwargs: Mapping[str, object],
+    ) -> BatchFeature:
+        if mm_data:
+            outputs = self.info.ctx.call_hf_processor(
+                self.info.get_hf_processor(**mm_kwargs),
+                dict(prompt=prompt, **mm_data),
+                mm_kwargs,
+            )
+
+            # The Deepseek-VL2 processor doesn't return a BatchFeature,
+            # so we need to create one manually
+            processed_outputs = dict(input_ids=outputs["input_ids"])
+            processed_outputs = BatchFeature(data=dict(processed_outputs),
+                                             tensor_type="pt")
+
+            # Remove the batch dimension from the processor outputs,
+            # because we will batch them later to create NestedTensors
+            target_dtype = self.info.ctx.model_config.dtype
+            pixel_values = outputs["images"].to(target_dtype).squeeze(0)
+            images_spatial_crop = outputs["images_spatial_crop"].squeeze(0)
+            patches_per_image = [
+                x.prod().item() + 1 for x in images_spatial_crop
+            ]
+
+            # Rename `images` -> `pixel_values` to avoid confusion
+            processed_outputs["pixel_values"] = list(
+                pixel_values.split(patches_per_image))
+            processed_outputs["images_spatial_crop"] = images_spatial_crop
+        else:
+            tokenizer = self.info.get_tokenizer()
+            processed_outputs = tokenizer(prompt,
+                                          add_special_tokens=True,
+                                          return_tensors="pt")
+
+        return processed_outputs
+
+    def _get_mm_fields_config(
+        self,
+        hf_inputs: BatchFeature,
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> Mapping[str, MultiModalFieldConfig]:
+        return dict(
+            pixel_values=MultiModalFieldConfig.batched("image"),
+            images_spatial_crop=MultiModalFieldConfig.batched("image"),
+            image_embeds=MultiModalFieldConfig.batched("image"),
+        )
+
+    def _get_prompt_replacements(
+        self,
+        mm_items: MultiModalDataItems,
+        hf_processor_mm_kwargs: Mapping[str, object],
+        out_mm_kwargs: MultiModalKwargs,
+    ) -> list[PromptReplacement]:
+        hf_processor = self.info.get_hf_processor()
+        image_token_id: int = hf_processor.image_token_id
+
+        def get_replacement_deepseek_vl2(item_idx: int):
+            images = mm_items.get_items(
+                "image", (ImageEmbeddingItems, ImageProcessorItems))
+
+            if isinstance(images, ImageEmbeddingItems):
+                num_image_tokens = images.get_feature_size(item_idx)
+            else:
+                image_size = images.get_image_size(item_idx)
+
+                num_image_tokens = self.info.get_num_image_tokens(
+                    image_width=image_size.width,
+                    image_height=image_size.height,
+                )
+            return [image_token_id] * num_image_tokens
+
+        return [
+            PromptReplacement(
+                modality="image",
+                target=[image_token_id],
+                replacement=get_replacement_deepseek_vl2,
+            )
+        ]
+
+
+@MULTIMODAL_REGISTRY.register_processor(
+    DeepseekVL2MultiModalProcessor,
+    info=DeepseekVL2ProcessingInfo,
+    dummy_inputs=DeepseekVL2DummyInputsBuilder)
+class DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
+
+    hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={
+        "language.": "language_model.",
+    })
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__()
+        config: DeepseekVLV2Config = vllm_config.model_config.hf_config
+        quant_config = vllm_config.quant_config
+        multimodal_config = vllm_config.model_config.multimodal_config
+
+        self.config = config
+        self.multimodal_config = multimodal_config
+
+        self.vision_config = config.vision_config
+        self.projector_config = config.projector_config
+        self.text_config = config.text_config
+
+        model_config = vllm_config.model_config
+        tokenizer = cached_get_tokenizer(
+            model_config.tokenizer,
+            tokenizer_mode=model_config.tokenizer_mode,
+            tokenizer_revision=model_config.tokenizer_revision,
+            trust_remote_code=model_config.trust_remote_code,
+        )
+        self.image_token_id = tokenizer.vocab.get(_IMAGE_TOKEN)
+
+        self.vision = self._init_vision_module(self.vision_config,
+                                               quant_config,
+                                               maybe_prefix(prefix, "vision"))
+
+        self.projector = MlpProjector(self.projector_config)
+        self.tile_tag = config.tile_tag
+        self.global_view_pos = config.global_view_pos
+
+        # special token for image token sequence format
+        embed_std = 1 / torch.sqrt(
+            torch.tensor(self.projector_config.n_embed, dtype=torch.float32))
+        if self.tile_tag == "2D":
+            # <|view_separator|>, <|\n|>
+            self.image_newline = nn.Parameter(
+                torch.randn(self.projector_config.n_embed) * embed_std)
+            # This is a typo in original implementation
+            self.view_seperator = nn.Parameter(
+                torch.randn(self.projector_config.n_embed) * embed_std)
+        else:
+            raise ValueError(
+                f"Only 2D tile_tag is supported currently, got: {self.tile_tag}"
+            )
+
+        self.language_model = init_vllm_registered_model(
+            vllm_config=vllm_config,
+            hf_config=self.text_config,
+            prefix=maybe_prefix(prefix, "language"),
+            architectures=["DeepseekV3ForCausalLM"]
+            if self.text_config.topk_method == "noaux_tc" else
+            ["DeepseekV2ForCausalLM"],
+        )
+
+        self.make_empty_intermediate_tensors = (
+            self.language_model.make_empty_intermediate_tensors)
+
+    def _init_vision_module(
+        self,
+        vision_config: VisionEncoderConfig,
+        quant_config: Optional[QuantizationConfig],
+        prefix: str = "",
+    ) -> nn.Module:
+        # TODO: refactor vision model through timm wrapper from transformers
+        try:
+            import timm
+        except ImportError as e:
+            raise ImportError("Please install timm") from e
+        # Build the (non-pretrained) ViT with float16 as the default dtype.
+        with set_default_torch_dtype(torch.float16):
+            model = timm.create_model(
+                "vit_so400m_patch14_siglip_384.webli",
+                pretrained=False,
+                num_classes=0,
+                dynamic_img_size=True,
+                dynamic_img_pad=True,
+            )
+        # Cast to whatever default dtype is active outside that context.
+        model = model.to(dtype=torch.get_default_dtype())
+        return model
+
+    @cached_property
+    def sampler(self):
+        if hasattr(self.language_model, "sampler"):
+            return self.language_model.sampler
+
+        return get_sampler()
+
+    def _validate_pixel_values(
+        self, data: Union[torch.Tensor, List[torch.Tensor]]
+    ) -> Union[torch.Tensor, List[torch.Tensor]]:
+
+        h = w = self.vision_config.image_size
+        expected_dims = (3, h, w)
+
+        def _validate_shape(d: torch.Tensor):
+            actual_dims = tuple(d.shape[1:])
+
+            if actual_dims != expected_dims:
+                expected_expr = ("num_patches", *map(str, expected_dims))
+                raise ValueError(
+                    "The expected shape of pixel values per image per batch "
+                    f"is {expected_expr}. You supplied {tuple(d.shape)}.")
+
+        for d in data:
+            _validate_shape(d)
+
+        return data
+
+    def _validate_images_spatial_crop(
+        self, data: Union[torch.Tensor, List[torch.Tensor]]
+    ) -> Union[torch.Tensor, List[torch.Tensor]]:
+        expected_dims = 2
+
+        def _validate_shape(d: torch.Tensor):
+            actual_dims = d.size(-1)
+
+            if actual_dims != expected_dims:
+                expected_expr = str(expected_dims)
+                raise ValueError(
+                    f"The expected shape of image sizes per image per batch "
+                    f"is {expected_expr}. You supplied {tuple(d.shape)}.")
+
+        for d in data:
+            _validate_shape(d)
+
+        return data
+
+    def _parse_and_validate_image_input(
+            self, **kwargs: object) -> Optional[DeepseekVL2ImageInputs]:
+        pixel_values = kwargs.pop("pixel_values", None)
+        images_spatial_crop = kwargs.pop("images_spatial_crop", None)
+        image_embeds = kwargs.pop("image_embeds", None)
+
+        if pixel_values is None and image_embeds is None:
+            return None
+
+        if pixel_values is not None:
+            if not isinstance(pixel_values, (torch.Tensor, list)):
+                raise ValueError("Incorrect type of pixel values. "
+                                 f"Got type: {type(pixel_values)}")
+
+            if not isinstance(images_spatial_crop, (torch.Tensor, list)):
+                raise ValueError("Incorrect type of image sizes. "
+                                 f"Got type: {type(images_spatial_crop)}")
+
+            return DeepseekVL2ImagePixelInputs(
+                type="pixel_values",
+                data=self._validate_pixel_values(flatten_bn(pixel_values)),
+                images_spatial_crop=self._validate_images_spatial_crop(
+                    flatten_bn(images_spatial_crop, concat=True)))
+
+        if image_embeds is not None:
+            if not isinstance(image_embeds, torch.Tensor):
+                raise ValueError("Incorrect type of image embeddings. "
+                                 f"Got type: {type(image_embeds)}")
+
+            return DeepseekVL2VImageEmbeddingInputs(
+                type="image_embeds",
+                data=flatten_bn(image_embeds),
+            )
+
+        raise AssertionError("This line should be unreachable.")
+
+    def _pixel_values_to_embedding(
+        self,
+        pixel_values: NestedTensors,
+        images_spatial_crop: torch.Tensor,
+    ) -> NestedTensors:
+        # pixel_values: n_image * batch_size * [patch_per_img, 3, height, width]
+        total_tiles = [x for x in pixel_values]
+
+        # [batch_all_tiles, 3, height, width]
+        total_tiles = torch.cat(total_tiles, dim=0)
+
+        # [batch_all_tiles, vit_seq_len, c]
+        images_feature = self.vision.forward_features(total_tiles)
+
+        # [batch_all_tiles, hw, D]
+        images_embeds = self.projector(images_feature)
+
+        _, hw, n_dim = images_embeds.shape
+        h = w = int(hw**0.5)
+
+        # Fill the image token sequence per self.tile_tag & self.global_view_pos
+        tile_index = 0
+        vision_embeddings = []
+        for jdx in range(images_spatial_crop.size(0)):
+            # extract global & local features
+            num_width_tiles, num_height_tiles = images_spatial_crop[jdx]
+            if num_width_tiles == 0 or num_height_tiles == 0:
+                break
+            num_tiles_in_image = num_width_tiles * num_height_tiles
+
+            # [hw, D]
+            global_features = images_embeds[tile_index]
+
+            # [num_height_tiles * num_width_tiles, hw, D]
+            local_features = images_embeds[tile_index + 1:tile_index + 1 +
+                                           num_tiles_in_image]
+            tile_index += num_tiles_in_image + 1
+
+            # format global and local features
+            # ----------------- global view add newline -----------------
+            # [hw, D] -> [h, w, D]
+            global_features = global_features.view(h, w, n_dim)
+
+            # [D]     -> [h, 1, D]
+            new_lines_in_global = repeat(self.image_newline, "d -> h 1 d", h=h)
+
+            # cat([h, w, D], [h, 1, D], dim=1) -> [h, w + 1, D]
+            global_features = torch.cat([global_features, new_lines_in_global],
+                                        dim=1)
+
+            # [h, w + 1, D] -> [h * (w + 1), D]
+            global_features = global_features.view(-1, n_dim)
+
+            # ----------------- local view add newline -----------------
+            # [num_height_tiles * num_width_tiles, h * w, D] ->
+            # [num_height_tiles * h, num_width_tiles * w, D]
+            local_features = rearrange(local_features,
+                                       "(th tw) (h w) d -> (th h) (tw w) d",
+                                       th=num_height_tiles,
+                                       tw=num_width_tiles,
+                                       h=h,
+                                       w=w)
+
+            # [D] -> [num_height_tiles * h, 1, D]
+            new_lines_in_local = repeat(self.image_newline,
+                                        "d -> (th h) 1 d",
+                                        th=num_height_tiles,
+                                        h=h)
+
+            # [num_height_tiles * h, num_width_tiles * w + 1, D]
+            local_features = torch.cat([local_features, new_lines_in_local],
+                                       dim=1)
+
+            # [num_height_tiles * h, num_width_tiles * w + 1, D]
+            #   --> [(num_height_tiles * h) * (num_width_tiles * w + 1), D]
+            local_features = local_features.view(-1, n_dim)
+
+            # merge global and local tiles
+            if self.global_view_pos == "head":
+                global_local_features = torch.cat([
+                    global_features,
+                    self.view_seperator[None, :],
+                    local_features,
+                ])
+            else:
+                global_local_features = torch.cat([
+                    local_features,
+                    self.view_seperator[None, :],
+                    global_features,
+                ])
+
+            vision_embeddings.append(global_local_features)
+        return vision_embeddings
+
+    def _process_image_input(
+            self, image_input: DeepseekVL2ImageInputs) -> NestedTensors:
+        if image_input["type"] == "image_embeds":
+            image_data = image_input["data"]
+            if is_list_of(image_data, torch.Tensor):
+                # Already a list of tensors; pass it through unchanged.
+                return image_data
+            if len(image_data.shape) == 3:
+                # Single 3D tensor: unbind dim 0 into a list of 2D tensors.
+                return list(torch.unbind(image_data, dim=0))
+            raise ValueError(
+                "We expect batched 2D tensors; "
+                "this can be either a list of 2D tensors or a single 3D tensor."
+            )
+        # Any other input type carries pixel values plus their spatial crop
+        # layout; convert them to embeddings via the vision path.
+        pixel_values = image_input["data"]
+        images_spatial_crop = image_input["images_spatial_crop"]
+        return self._pixel_values_to_embedding(
+            pixel_values=pixel_values, images_spatial_crop=images_spatial_crop)
+
+    def get_multimodal_embeddings(
+            self, **kwargs: object) -> Optional[NestedTensors]:
+        # None means neither `pixel_values` nor `image_embeds` was supplied.
+        image_input = self._parse_and_validate_image_input(**kwargs)
+        return (None if image_input is None else
+                self._process_image_input(image_input))
+
+    def get_input_embeddings(
+        self,
+        input_ids: torch.Tensor,
+        multimodal_embeddings: Optional[NestedTensors] = None,
+    ) -> torch.Tensor:
+        inputs_embeds = self.language_model.get_input_embeddings(input_ids)
+        if multimodal_embeddings is not None:
+            inputs_embeds = merge_multimodal_embeddings(
+                input_ids, inputs_embeds, multimodal_embeddings,
+                self.image_token_id)
+        return inputs_embeds
+
+    def forward(self,
+                input_ids: torch.Tensor,
+                positions: torch.Tensor,
+                kv_caches: List[torch.Tensor],
+                attn_metadata: AttentionMetadata,
+                intermediate_tensors: Optional[IntermediateTensors] = None,
+                inputs_embeds: Optional[torch.Tensor] = None,
+                **kwargs: object):
+
+        if intermediate_tensors is not None:
+            inputs_embeds = None
+
+        # NOTE: In v1, inputs_embeds is always generated at model runner, this
+        # condition is for v0 compatibility
+        elif inputs_embeds is None:
+            vision_embeddings = self.get_multimodal_embeddings(**kwargs)
+            inputs_embeds = self.get_input_embeddings(input_ids,
+                                                      vision_embeddings)
+            input_ids = None
+
+        hidden_states = self.language_model(input_ids,
+                                            positions,
+                                            kv_caches,
+                                            attn_metadata,
+                                            intermediate_tensors,
+                                            inputs_embeds=inputs_embeds)
+
+        return hidden_states
+
+    def compute_logits(
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[torch.Tensor]:
+        return self.language_model.compute_logits(hidden_states,
+                                                  sampling_metadata)
+
+    def sample(
+        self,
+        logits: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[SamplerOutput]:
+        return self.language_model.sample(logits, sampling_metadata)
+
+    def load_weights(self, weights: Iterable[Tuple[str,
+                                                   torch.Tensor]]) -> Set[str]:
+
+        loader = AutoWeightsLoader(self)
+        autoloaded_weights = loader.load_weights(weights,
+                                                 mapper=self.hf_to_vllm_mapper)
+        return autoloaded_weights
diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py
index 8f36437d47d9e..ff7dab89e4da8 100644
--- a/vllm/model_executor/models/minicpmv.py
+++ b/vllm/model_executor/models/minicpmv.py
@@ -657,7 +657,7 @@ def init_vision_module(
         quant_config: Optional[QuantizationConfig],
         prefix: str = "",
     ) -> nn.Module:
-        # TODO: refactor this vision model
+        # TODO: refactor vision model through timm wrapper from transformers
         try:
             import timm
         except ImportError:
diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py
index 62840b8c1bcda..a7286a9203f67 100644
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -149,6 +149,7 @@
     "ChameleonForConditionalGeneration": ("chameleon", "ChameleonForConditionalGeneration"),  # noqa: E501
     "ChatGLMModel": ("chatglm", "ChatGLMForCausalLM"),
     "ChatGLMForConditionalGeneration": ("chatglm", "ChatGLMForCausalLM"),
+    "DeepseekVLV2ForCausalLM": ("deepseek_vl2", "DeepseekVLV2ForCausalLM"),
     "FuyuForCausalLM": ("fuyu", "FuyuForCausalLM"),
     "H2OVLChatModel": ("h2ovl", "H2OVLChatModel"),
     "InternVLChatModel": ("internvl", "InternVLChatModel"),
diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py
index 58417980e7b47..c97acffa1a719 100644
--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@@ -23,8 +23,9 @@
 # yapf conflicts with isort for this block
 # yapf: disable
 from vllm.transformers_utils.configs import (ChatGLMConfig, Cohere2Config,
-                                             DbrxConfig, EAGLEConfig,
-                                             ExaoneConfig, H2OVLChatConfig,
+                                             DbrxConfig, DeepseekVLV2Config,
+                                             EAGLEConfig, ExaoneConfig,
+                                             H2OVLChatConfig,
                                              InternVLChatConfig, JAISConfig,
                                              MedusaConfig, MllamaConfig,
                                              MLPSpeculatorConfig, MPTConfig,
@@ -54,6 +55,7 @@
     "chatglm": ChatGLMConfig,
     "cohere2": Cohere2Config,
     "dbrx": DbrxConfig,
+    "deepseek_vl_v2": DeepseekVLV2Config,
     "mpt": MPTConfig,
     "RefinedWeb": RWConfig,  # For tiiuae/falcon-40b(-instruct)
     "RefinedWebModel": RWConfig,  # For tiiuae/falcon-7b(-instruct)
diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py
index a41a35c88b3a1..f065c56124605 100644
--- a/vllm/transformers_utils/configs/__init__.py
+++ b/vllm/transformers_utils/configs/__init__.py
@@ -1,6 +1,7 @@
 from vllm.transformers_utils.configs.chatglm import ChatGLMConfig
 from vllm.transformers_utils.configs.cohere2 import Cohere2Config
 from vllm.transformers_utils.configs.dbrx import DbrxConfig
+from vllm.transformers_utils.configs.deepseek_vl2 import DeepseekVLV2Config
 from vllm.transformers_utils.configs.eagle import EAGLEConfig
 from vllm.transformers_utils.configs.exaone import ExaoneConfig
 # RWConfig is for the original tiiuae/falcon-40b(-instruct) and
@@ -25,6 +26,7 @@
     "ChatGLMConfig",
     "Cohere2Config",
     "DbrxConfig",
+    "DeepseekVLV2Config",
     "MPTConfig",
     "RWConfig",
     "H2OVLChatConfig",
diff --git a/vllm/transformers_utils/configs/deepseek_vl2.py b/vllm/transformers_utils/configs/deepseek_vl2.py
new file mode 100644
index 0000000000000..681528c3c0116
--- /dev/null
+++ b/vllm/transformers_utils/configs/deepseek_vl2.py
@@ -0,0 +1,214 @@
+# adapted from https://github.com/deepseek-ai/DeepSeek-VL2/blob/faf18023f24b962b32d9f0a2d89e402a8d383a78/deepseek_vl2/models/modeling_deepseek_vl_v2.py#L115-L268
+from typing import Tuple
+
+from transformers.configuration_utils import PretrainedConfig
+
+
+class VisionEncoderConfig(PretrainedConfig):
+    model_type: str = "vision"
+
+    model_name: str = "vit_so400m_patch14_siglip_384.webli"
+    image_size: int = 384
+    patch_size: int = 16
+    width: int = 1024
+    layers: int = 24
+    heads: int = 16
+    mlp_ratio: int = 4
+    global_pool: str = "map"
+    ignore_head: bool = True
+    class_token: bool = False
+    num_classes: int = 0
+    use_checkpoint: bool = False
+    weight_init: str = "skip"
+    deterministic: bool = False
+    num_recomputing_layers: int = 0
+
+    def __init__(self,
+                 model_name: str = "vit_so400m_patch14_siglip_384.webli",
+                 image_size: int = 384,
+                 patch_size: int = 16,
+                 width: int = 1024,
+                 layers: int = 24,
+                 heads: int = 16,
+                 mlp_ratio: int = 4,
+                 global_pool: str = "map",
+                 ignore_head: bool = True,
+                 class_token: bool = False,
+                 num_classes: int = 0,
+                 use_checkpoint: bool = False,
+                 **kwargs):
+        self.model_name = model_name
+        self.image_size = image_size
+        self.patch_size = patch_size
+        self.width = width
+        self.layers = layers
+        self.heads = heads
+        self.mlp_ratio = mlp_ratio
+        self.global_pool = global_pool
+        self.ignore_head = ignore_head
+        self.class_token = class_token
+        self.num_classes = num_classes
+        self.use_checkpoint = use_checkpoint
+
+        super().__init__(**kwargs)
+
+
+class MlpProjectorConfig(PretrainedConfig):
+    model_type = "mlp_projector"
+    projector_type: str = "downsample_mlp_gelu"
+    input_dim: int = 1152
+    n_embed: int = 2048
+    depth: int = 2
+    mlp_ratio: int = 1
+    downsample_ratio: int = 2
+    token_pooling: bool = False
+
+    def __init__(self,
+                 projector_type: str = "downsample_mlp_gelu",
+                 input_dim: int = 1152,
+                 n_embed: int = 2048,
+                 depth: int = 2,
+                 mlp_ratio: int = 1,
+                 downsample_ratio: int = 2,
+                 **kwargs):
+        self.projector_type = projector_type
+        self.input_dim = input_dim
+        self.n_embed = n_embed
+        self.depth = depth
+        self.mlp_ratio = mlp_ratio
+        self.downsample_ratio = downsample_ratio
+
+        super().__init__(**kwargs)
+
+
+class DeepseekV2Config(PretrainedConfig):
+
+    model_type = "deepseek_v2"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vocab_size=102400,
+        hidden_size=4096,
+        intermediate_size=11008,
+        moe_intermediate_size=1407,
+        num_hidden_layers=30,
+        num_attention_heads=32,
+        num_key_value_heads=32,
+        n_shared_experts=None,
+        n_routed_experts=None,
+        ep_size=1,
+        routed_scaling_factor=1.0,
+        kv_lora_rank=512,
+        q_lora_rank=1536,
+        qk_rope_head_dim=64,
+        v_head_dim=128,
+        qk_nope_head_dim=128,
+        topk_method='gready',
+        n_group=None,
+        topk_group=None,
+        num_experts_per_tok=None,
+        moe_layer_freq=1,
+        first_k_dense_replace=0,
+        norm_topk_prob=False,
+        scoring_func='softmax',
+        aux_loss_alpha=0.001,
+        seq_aux=True,
+        hidden_act="silu",
+        max_position_embeddings=2048,
+        initializer_range=0.02,
+        rms_norm_eps=1e-6,
+        use_cache=True,
+        pad_token_id=None,
+        bos_token_id=100000,
+        eos_token_id=100001,
+        pretraining_tp=1,
+        tie_word_embeddings=False,
+        rope_theta=10000.0,
+        rope_scaling=None,
+        attention_bias=False,
+        attention_dropout=0.0,
+        use_mla=True,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.moe_intermediate_size = moe_intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.n_shared_experts = n_shared_experts
+        self.n_routed_experts = n_routed_experts
+        self.ep_size = ep_size
+        self.routed_scaling_factor = routed_scaling_factor
+        self.kv_lora_rank = kv_lora_rank
+        self.q_lora_rank = q_lora_rank
+        self.qk_rope_head_dim = qk_rope_head_dim
+        self.v_head_dim = v_head_dim
+        self.qk_nope_head_dim = qk_nope_head_dim
+        self.topk_method = topk_method
+        self.n_group = n_group
+        self.topk_group = topk_group
+        self.num_experts_per_tok = num_experts_per_tok
+        self.moe_layer_freq = moe_layer_freq
+        self.first_k_dense_replace = first_k_dense_replace
+        self.norm_topk_prob = norm_topk_prob
+        self.scoring_func = scoring_func
+        self.aux_loss_alpha = aux_loss_alpha
+        self.seq_aux = seq_aux
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = float(rms_norm_eps)
+        self.pretraining_tp = pretraining_tp
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+        self.use_mla = use_mla
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+
+class DeepseekVLV2Config(PretrainedConfig):
+    """Top-level DeepSeek-VL2 config wrapping vision/projector/text configs."""
+    model_type = "deepseek_vl_v2"
+    vision_config: VisionEncoderConfig
+    projector_config: MlpProjectorConfig
+    tile_tag: str = "2D"
+    global_view_pos: str = "head"
+    candidate_resolutions: Tuple[Tuple[int, int], ...] = ((384, 384), )
+
+    def __init__(self,
+                 tile_tag: str = "2D",
+                 global_view_pos: str = "head",
+                 candidate_resolutions: Tuple[Tuple[int, int],
+                                              ...] = ((384, 384), ),
+                 **kwargs):
+        super().__init__(**kwargs)
+        # Sub-configs are read from kwargs; absent ones use class defaults.
+        vision_config = kwargs.get("vision_config", {})
+        self.vision_config = VisionEncoderConfig(**vision_config)
+
+        projector_config = kwargs.get("projector_config", {})
+        self.projector_config = MlpProjectorConfig(**projector_config)
+        # The language model settings are read from the "language_config" key.
+        language_config = kwargs.get("language_config", {})
+        self.text_config = DeepseekV2Config(**language_config)
+
+        self.tile_tag = tile_tag
+        self.global_view_pos = global_view_pos
+        self.candidate_resolutions = candidate_resolutions
+        self.vocab_size = self.text_config.vocab_size

From 8bddb735123204872788a8ffe117321de7550e6c Mon Sep 17 00:00:00 2001
From: Akshat Tripathi <Akshat.tripathi6568@gmail.com>
Date: Sun, 12 Jan 2025 13:01:52 +0000
Subject: [PATCH 093/115] [Hardware][CPU] Multi-LoRA implementation for the CPU
 backend (#11100)

Signed-off-by: Akshat Tripathi <akshat@krai.ai>
Signed-off-by: Oleg Mosalov <oleg@krai.ai>
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Oleg Mosalov <oleg@krai.ai>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Isotr0py <2037008807@qq.com>
---
 .buildkite/run-cpu-test.sh                    |   6 +
 docs/source/features/compatibility_matrix.md  |   2 +-
 tests/lora/conftest.py                        |  32 +-
 tests/lora/test_layers.py                     |  39 +-
 tests/lora/test_lora_manager.py               |  21 +-
 tests/lora/test_mixtral.py                    |   4 +-
 ...nica_sizes.py => test_punica_ops_sizes.py} | 124 ++++---
 ...iation.py => test_punica_ops_variation.py} | 134 ++++---
 tests/lora/test_quant_model.py                |   3 +-
 tests/lora/utils.py                           |  27 --
 vllm/executor/cpu_executor.py                 |   3 -
 vllm/lora/ops/torch_ops/__init__.py           |  13 +
 vllm/lora/ops/torch_ops/lora_ops.py           | 113 ++++++
 vllm/lora/ops/triton_ops/__init__.py          |  13 +
 vllm/lora/ops/{ => triton_ops}/bgmv_expand.py |   0
 .../ops/{ => triton_ops}/bgmv_expand_slice.py |   0
 vllm/lora/ops/{ => triton_ops}/bgmv_shrink.py |   0
 vllm/lora/ops/{ => triton_ops}/sgmv_expand.py |   0
 vllm/lora/ops/{ => triton_ops}/sgmv_shrink.py |   0
 vllm/lora/ops/{ => triton_ops}/utils.py       |   0
 vllm/lora/punica_wrapper/punica_cpu.py        | 346 ++++++++++++++++++
 vllm/lora/punica_wrapper/punica_gpu.py        |  10 +-
 vllm/lora/punica_wrapper/punica_selector.py   |   5 +
 vllm/worker/cpu_model_runner.py               | 133 ++++++-
 vllm/worker/cpu_worker.py                     |  20 +-
 25 files changed, 855 insertions(+), 193 deletions(-)
 rename tests/lora/{test_punica_sizes.py => test_punica_ops_sizes.py} (77%)
 rename tests/lora/{test_punica_variation.py => test_punica_ops_variation.py} (74%)
 create mode 100644 vllm/lora/ops/torch_ops/__init__.py
 create mode 100644 vllm/lora/ops/torch_ops/lora_ops.py
 create mode 100644 vllm/lora/ops/triton_ops/__init__.py
 rename vllm/lora/ops/{ => triton_ops}/bgmv_expand.py (100%)
 rename vllm/lora/ops/{ => triton_ops}/bgmv_expand_slice.py (100%)
 rename vllm/lora/ops/{ => triton_ops}/bgmv_shrink.py (100%)
 rename vllm/lora/ops/{ => triton_ops}/sgmv_expand.py (100%)
 rename vllm/lora/ops/{ => triton_ops}/sgmv_shrink.py (100%)
 rename vllm/lora/ops/{ => triton_ops}/utils.py (100%)
 create mode 100644 vllm/lora/punica_wrapper/punica_cpu.py

diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh
index 4ae66f6f3215a..9925db7bea593 100644
--- a/.buildkite/run-cpu-test.sh
+++ b/.buildkite/run-cpu-test.sh
@@ -75,6 +75,12 @@ function cpu_tests() {
       --num-prompts 20 \
       --endpoint /v1/completions \
       --tokenizer facebook/opt-125m"
+
+  # Run multi-lora tests
+  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
+    set -e
+    pytest -s -v \
+    tests/lora/test_qwen2vl.py"
 }
 
 # All of CPU tests are expected to be finished less than 25 mins.
diff --git a/docs/source/features/compatibility_matrix.md b/docs/source/features/compatibility_matrix.md
index 8d8f7dca2e5b5..ea1d545ff3d73 100644
--- a/docs/source/features/compatibility_matrix.md
+++ b/docs/source/features/compatibility_matrix.md
@@ -359,7 +359,7 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar
      - ✅
      - ✅
      - ✅
-     - [✗](gh-pr:4830)
+     - ✅
      - ✅
    * - <abbr title="Prompt Adapter">prmpt adptr</abbr>
      - ✅
diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py
index 57ebaa424fc59..e7378d00765f0 100644
--- a/tests/lora/conftest.py
+++ b/tests/lora/conftest.py
@@ -21,6 +21,7 @@
 from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
 from vllm.model_executor.model_loader import get_model
+from vllm.platforms import current_platform
 
 
 class ContextIDInfo(TypedDict):
@@ -65,13 +66,16 @@ def cleanup_fixture(should_do_global_cleanup_after_test: bool):
 @pytest.fixture
 def dist_init():
     temp_file = tempfile.mkstemp()[1]
-    init_distributed_environment(
-        world_size=1,
-        rank=0,
-        distributed_init_method=f"file://{temp_file}",
-        local_rank=0,
-        backend="nccl",
-    )
+
+    backend = "nccl"
+    if current_platform.is_cpu():
+        backend = "gloo"
+
+    init_distributed_environment(world_size=1,
+                                 rank=0,
+                                 distributed_init_method=f"file://{temp_file}",
+                                 local_rank=0,
+                                 backend=backend)
     initialize_model_parallel(1, 1)
     yield
     cleanup_dist_env_and_memory(shutdown_ray=True)
@@ -81,13 +85,15 @@ def dist_init():
 def dist_init_torch_only():
     if torch.distributed.is_initialized():
         return
+    backend = "nccl"
+    if current_platform.is_cpu():
+        backend = "gloo"
+
     temp_file = tempfile.mkstemp()[1]
-    torch.distributed.init_process_group(
-        backend="nccl",
-        world_size=1,
-        rank=0,
-        init_method=f"file://{temp_file}",
-    )
+    torch.distributed.init_process_group(world_size=1,
+                                         rank=0,
+                                         init_method=f"file://{temp_file}",
+                                         backend=backend)
 
 
 @pytest.fixture
diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py
index fb8c0b2a7ba26..08a589d7ee29c 100644
--- a/tests/lora/test_layers.py
+++ b/tests/lora/test_layers.py
@@ -48,10 +48,14 @@
     torch.float32: (5e-3, 5e-3),
     torch.bfloat16: (3e-2, 2e-2),
 }
-# TODO: Modify this based on platform
-DEVICES = [
+
+pytestmark = pytest.mark.skipif(
+    not (current_platform.is_cuda_alike() or current_platform.is_cpu()),
+    reason="Backend not supported")
+
+DEVICES = ([
     f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
-]
+] if current_platform.is_cuda_alike() else ["cpu"])
 
 #For GPU, we will launch different triton kernels between the prefill and decode
 # stages, so we need to verify this. prefill stage(True) or decode stage(False)
@@ -198,6 +202,10 @@ def check_punica_wrapper(punica_wrapper) -> bool:
         from vllm.lora.punica_wrapper.punica_gpu import PunicaWrapperGPU
 
         return type(punica_wrapper) is PunicaWrapperGPU
+    elif current_platform.is_cpu():
+        from vllm.lora.punica_wrapper.punica_cpu import PunicaWrapperCPU
+
+        return type(punica_wrapper) is PunicaWrapperCPU
     else:
         return False
 
@@ -211,7 +219,8 @@ def test_embeddings(dist_init, num_loras, device, vocab_size, stage) -> None:
     # For multi-GPU testing of Triton kernel, we must explicitly set the CUDA
     # device, see: https://github.com/triton-lang/triton/issues/2925
     # Same below.
-    torch.cuda.set_device(device)
+    if current_platform.is_cuda_alike():
+        torch.cuda.set_device(device)
 
     torch.set_default_device(device)
     max_loras = 8
@@ -313,7 +322,9 @@ def create_random_embedding_layer():
 def test_embeddings_with_new_embeddings(dist_init, num_loras, device,
                                         vocab_size, stage) -> None:
 
-    torch.cuda.set_device(device)
+    if current_platform.is_cuda_alike():
+        torch.cuda.set_device(device)
+
     torch.set_default_device(device)
     max_loras = 8
     punica_wrapper = get_punica_wrapper(8192, 256, device)
@@ -450,7 +461,9 @@ def create_random_embedding_layer():
 def test_lm_head_logits_processor(dist_init, num_loras, device, vocab_size,
                                   stage) -> None:
 
-    torch.cuda.set_device(device)
+    if current_platform.is_cuda_alike():
+        torch.cuda.set_device(device)
+
     torch.set_default_device(device)
     max_loras = 8
     punica_wrapper = get_punica_wrapper(8192, 256, device)
@@ -582,7 +595,9 @@ def _pretest():
 def test_linear_replicated(dist_init, num_loras, device, stage,
                            bias_enabled) -> None:
 
-    torch.cuda.set_device(device)
+    if current_platform.is_cuda_alike():
+        torch.cuda.set_device(device)
+
     torch.set_default_device(device)
     punica_wrapper = get_punica_wrapper(8192, 256, device)
     assert check_punica_wrapper(punica_wrapper)
@@ -695,7 +710,9 @@ def create_random_linear_replicated_layer():
 def test_linear_parallel(dist_init, num_loras, orientation, fully_shard,
                          device, stage, bias_enabled) -> None:
 
-    torch.cuda.set_device(device)
+    if current_platform.is_cuda_alike():
+        torch.cuda.set_device(device)
+
     torch.set_default_device(device)
     punica_wrapper = get_punica_wrapper(8192, 256, device)
     assert check_punica_wrapper(punica_wrapper)
@@ -818,7 +835,9 @@ def create_random_linear_parallel_layer():
 def test_column_parallel_packed(dist_init, num_loras, repeats, fully_shard,
                                 device, stage, bias_enabled) -> None:
 
-    torch.cuda.set_device(device)
+    if current_platform.is_cuda_alike():
+        torch.cuda.set_device(device)
+
     torch.set_default_device(device)
     punica_wrapper = get_punica_wrapper(8192, 256, device)
     assert check_punica_wrapper(punica_wrapper)
@@ -971,6 +990,8 @@ class FakeConfig:
 @pytest.mark.parametrize("rotary_dim", [None, 32])
 @pytest.mark.parametrize("head_size", [32, 108])
 @pytest.mark.parametrize("seq_len", [11, 1024])
+@pytest.mark.skipif(not current_platform.is_cuda_alike(),
+                    reason="Only CUDA backends are supported")
 def test_rotary_embedding_long_context(dist_init, num_loras, device,
                                        scaling_factors, max_position,
                                        is_neox_style, rotary_dim, head_size,
diff --git a/tests/lora/test_lora_manager.py b/tests/lora/test_lora_manager.py
index a099f36b0a465..ca523c66abe42 100644
--- a/tests/lora/test_lora_manager.py
+++ b/tests/lora/test_lora_manager.py
@@ -20,6 +20,7 @@
 from vllm.lora.worker_manager import (LRUCacheWorkerLoRAManager,
                                       WorkerLoRAManager)
 from vllm.model_executor.layers.linear import RowParallelLinear
+from vllm.platforms import current_platform
 
 EMBEDDING_MODULES = {
     "embed_tokens": "input_embeddings",
@@ -28,9 +29,9 @@
 
 EMBEDDING_PADDING_MODULES = ["lm_head"]
 
-CUDA_DEVICES = [
+DEVICES = ([
     f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
-]
+] if current_platform.is_cuda_alike() else ["cpu"])
 
 
 def test_peft_helper(sql_lora_files):
@@ -83,7 +84,7 @@ def test_peft_helper(sql_lora_files):
         PEFTHelper.from_dict(config)
 
 
-@pytest.mark.parametrize("device", CUDA_DEVICES)
+@pytest.mark.parametrize("device", DEVICES)
 def test_from_lora_tensors(sql_lora_files, device):
     tensors = load_file(
         os.path.join(sql_lora_files, "adapter_model.safetensors"))
@@ -171,7 +172,7 @@ def test_replace_submodules(dist_init, dummy_model):
     manager = LoRAModelManager(
         model, 1, 1, 1,
         LoRAConfig(max_lora_rank=8, max_cpu_loras=8, max_loras=8),
-        torch.device("cuda"))
+        torch.device(DEVICES[0]))
     model = manager.model
 
     assert isinstance(model.get_submodule("dense1"),
@@ -183,7 +184,7 @@ def test_replace_submodules(dist_init, dummy_model):
                       RowParallelLinearWithLoRA)
 
 
-@pytest.mark.parametrize("device", CUDA_DEVICES)
+@pytest.mark.parametrize("device", DEVICES)
 def test_lora_model_manager(dist_init, dummy_model, device):
     model = dummy_model
     model.supported_lora_modules = ["dense1", "dense2", "lm_head"]
@@ -244,7 +245,7 @@ def test_lora_model_manager(dist_init, dummy_model, device):
     assert manager.punica_wrapper.device == device
 
 
-@pytest.mark.parametrize("device", CUDA_DEVICES)
+@pytest.mark.parametrize("device", DEVICES)
 def test_lora_lru_cache_model_manager(dist_init, dummy_model, device):
     model = dummy_model
     model.supported_lora_modules = ["dense1", "dense2", "lm_head"]
@@ -336,7 +337,7 @@ def test_lora_lru_cache_model_manager(dist_init, dummy_model, device):
     assert manager.device == device
 
 
-@pytest.mark.parametrize("device", CUDA_DEVICES)
+@pytest.mark.parametrize("device", DEVICES)
 def test_lru_lora_model_manager(dist_init, dummy_model, device):
     # This tests just the LRU cache functionality, everything else is
     # tested in test_lora_model_manager
@@ -466,7 +467,7 @@ def test_lru_lora_model_manager(dist_init, dummy_model, device):
     assert manager.device == device
 
 
-@pytest.mark.parametrize("device", CUDA_DEVICES)
+@pytest.mark.parametrize("device", DEVICES)
 def test_lru_cache_worker_adapter_manager(llama_2_7b_model_extra_embeddings,
                                           sql_lora_files, device):
     lora_config = LoRAConfig(max_lora_rank=8, max_cpu_loras=4, max_loras=4)
@@ -545,7 +546,7 @@ def test_lru_cache_worker_adapter_manager(llama_2_7b_model_extra_embeddings,
             device)
 
 
-@pytest.mark.parametrize("device", CUDA_DEVICES)
+@pytest.mark.parametrize("device", DEVICES)
 def test_worker_adapter_manager(llama_2_7b_model_extra_embeddings,
                                 sql_lora_files, device):
     # Should remove every LoRA not specified in the request.
@@ -621,7 +622,7 @@ def test_worker_adapter_manager(llama_2_7b_model_extra_embeddings,
             device)
 
 
-@pytest.mark.parametrize("device", CUDA_DEVICES)
+@pytest.mark.parametrize("device", DEVICES)
 def test_packed_loras(dist_init, dummy_model_gate_up, device):
     model = dummy_model_gate_up
     model.supported_lora_modules = ["gate_up_proj"]
diff --git a/tests/lora/test_mixtral.py b/tests/lora/test_mixtral.py
index 797a495201d33..940a865228806 100644
--- a/tests/lora/test_mixtral.py
+++ b/tests/lora/test_mixtral.py
@@ -5,6 +5,7 @@
 
 import vllm
 from vllm.lora.request import LoRARequest
+from vllm.platforms import current_platform
 
 MODEL_PATH = "mistralai/Mixtral-8x7B-Instruct-v0.1"
 
@@ -31,7 +32,8 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int,
 @pytest.mark.parametrize("tp_size", [4])
 def test_mixtral_lora(mixtral_lora_files, tp_size):
     """Original test, the LoRA model has the common target modules, not all"""
-    if torch.cuda.device_count() < tp_size:
+    if torch.cuda.device_count(
+    ) < tp_size and tp_size > 1 and current_platform.is_cuda_alike():
         pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")
 
     prompts = [
diff --git a/tests/lora/test_punica_sizes.py b/tests/lora/test_punica_ops_sizes.py
similarity index 77%
rename from tests/lora/test_punica_sizes.py
rename to tests/lora/test_punica_ops_sizes.py
index 0351fedd1cfa5..433ca7577d084 100644
--- a/tests/lora/test_punica_sizes.py
+++ b/tests/lora/test_punica_ops_sizes.py
@@ -9,17 +9,16 @@
 import pytest
 import torch
 
-from vllm.lora.ops.bgmv_expand import bgmv_expand
-from vllm.lora.ops.bgmv_expand_slice import bgmv_expand_slice
-from vllm.lora.ops.bgmv_shrink import bgmv_shrink
-from vllm.lora.ops.sgmv_expand import sgmv_expand
-from vllm.lora.ops.sgmv_shrink import sgmv_shrink
-from vllm.lora.ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT
+import vllm.lora.ops.triton_ops  # noqa: F401
+from vllm.lora.ops.torch_ops import (bgmv_expand, bgmv_expand_slice,
+                                     bgmv_shrink, sgmv_expand,
+                                     sgmv_expand_slice, sgmv_shrink)
+from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT
 from vllm.platforms import current_platform
 
 from .utils import (assert_close, generate_data,
                     generate_data_for_expand_nslices,
-                    generate_data_for_nslices, ref_torch_groupgemm)
+                    generate_data_for_nslices)
 
 HIDDEN_SIZES = [
     128,
@@ -113,7 +112,7 @@
 MAX_RANKS = [32]
 SCALES = [0.5]
 SEED = [0]
-CUDA_DEVICES = [f"cuda:{0}"]
+DEVICES = [f"cuda:{0}"]
 
 _dict_lock = Lock()
 
@@ -127,7 +126,7 @@
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("op_type", ["shrink", "expand"])
 @pytest.mark.parametrize("seed", SEED)
-@pytest.mark.parametrize("device", CUDA_DEVICES)
+@pytest.mark.parametrize("device", DEVICES)
 def test_punica_sgmv(
     batches: int,
     num_loras: int,
@@ -174,7 +173,7 @@ def test_punica_sgmv(
         # Preventing cache error pointer.
         with _dict_lock:
             _LORA_A_PTR_DICT.clear()
-            sgmv_shrink(
+            torch.ops.vllm.sgmv_shrink(
                 inputs_tensor,
                 lora_weights_lst,
                 our_out_tensor,
@@ -187,20 +186,23 @@ def test_punica_sgmv(
                 scaling,
             )
         for index in range(nslices):
-            ref_torch_groupgemm(
-                ref_out_tensor[index],
+            sgmv_shrink(
                 inputs_tensor,
                 lora_weights_lst[index],
-                lora_indices_tensor,
+                ref_out_tensor[index],
+                b_seq_start_loc,
                 seq_len_tensor,
+                lora_indices_tensor,
                 batches,
+                max_seq_length,
+                token_nums,
                 scaling,
-                op_type,
             )
+
     else:
         with _dict_lock:
             _LORA_B_PTR_DICT.clear()
-            sgmv_expand(
+            torch.ops.vllm.sgmv_expand(
                 inputs_tensor,
                 lora_weights_lst,
                 our_out_tensor,
@@ -213,21 +215,39 @@ def test_punica_sgmv(
                 offset_start=0,
                 add_inputs=True,
             )
-
-        slice_offset = 0
-        for index in range(nslices):
-            lora_weights = lora_weights_lst[index]
-            ref_torch_groupgemm(
-                ref_out_tensor[:, slice_offset:slice_offset + hidden_size],
-                inputs_tensor[index],
-                lora_weights,
-                lora_indices_tensor,
+        if nslices == 1:
+            # Verify torch's sgmv_expand op
+            sgmv_expand(
+                inputs_tensor[0],
+                lora_weights_lst[0],
+                ref_out_tensor,
+                b_seq_start_loc,
                 seq_len_tensor,
+                lora_indices_tensor,
                 batches,
-                1.0,
-                op_type,
+                max_seq_length,
+                token_nums,
+                add_inputs=True,
             )
-            slice_offset += hidden_size
+        else:
+            slice_offset = 0
+            for index in range(nslices):
+                lora_weights = lora_weights_lst[index]
+                sgmv_expand_slice(
+                    inputs_tensor[index],
+                    lora_weights,
+                    ref_out_tensor,
+                    b_seq_start_loc,
+                    seq_len_tensor,
+                    lora_indices_tensor,
+                    batches,
+                    max_seq_length,
+                    token_nums,
+                    slice_offset,
+                    hidden_size,
+                    add_inputs=True,
+                )
+                slice_offset += hidden_size
 
     assert_close(our_out_tensor, ref_out_tensor)
 
@@ -240,7 +260,7 @@ def test_punica_sgmv(
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("op_type", ["shrink", "expand"])
 @pytest.mark.parametrize("seed", SEED)
-@pytest.mark.parametrize("device", CUDA_DEVICES)
+@pytest.mark.parametrize("device", DEVICES)
 def test_punica_bgmv(
     batches: int,
     num_loras: int,
@@ -276,31 +296,38 @@ def test_punica_bgmv(
         device,
     )
     if op_type == "shrink":
-        bgmv_shrink(
+        torch.ops.vllm.bgmv_shrink(
             inputs_tensor,
             lora_weights,
             our_out_tensor,
             indices,
             scaling,
         )
+
+        bgmv_shrink(
+            inputs_tensor,
+            lora_weights,
+            ref_out_tensor,
+            indices,
+            scaling,
+        )
+
     else:
-        bgmv_expand(
+        torch.ops.vllm.bgmv_expand(
             inputs_tensor,
             lora_weights,
             our_out_tensor,
             indices,
             add_inputs=True,
         )
-    ref_torch_groupgemm(
-        ref_out_tensor,
-        inputs_tensor,
-        lora_weights,
-        lora_indices_tensor,
-        seq_len_tensor,
-        batches,
-        scaling if op_type == "shrink" else 1.0,
-        op_type,
-    )
+        bgmv_expand(
+            inputs_tensor,
+            lora_weights,
+            ref_out_tensor,
+            indices,
+            add_inputs=True,
+        )
+
     if op_type == "shrink":
         ref_out_tensor = ref_out_tensor.to(torch.float32)
     assert_close(our_out_tensor, ref_out_tensor)
@@ -313,7 +340,7 @@ def test_punica_bgmv(
 @pytest.mark.parametrize("nslices", [2, 3])
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("seed", SEED)
-@pytest.mark.parametrize("device", CUDA_DEVICES)
+@pytest.mark.parametrize("device", DEVICES)
 def test_punica_bgmv_expand_nslices(
     batches: int,
     num_loras: int,
@@ -350,7 +377,7 @@ def test_punica_bgmv_expand_nslices(
     slice_offset = 0
     for index in range(nslices):
         lora_weights = lora_weights_lst[index]
-        bgmv_expand_slice(
+        torch.ops.vllm.bgmv_expand_slice(
             inputs_tensor,
             lora_weights,
             our_outputs,
@@ -359,15 +386,14 @@ def test_punica_bgmv_expand_nslices(
             slice_size=hidden_size,
             add_inputs=True,
         )
-        ref_torch_groupgemm(
-            ref_outputs[:, slice_offset:slice_offset + hidden_size],
+        bgmv_expand_slice(
             inputs_tensor,
             lora_weights,
-            lora_indices_tensor,
-            seq_len_tensor,
-            batches,
-            1.0,
-            op_type="expand",
+            ref_outputs,
+            indices,
+            slice_offset,
+            slice_size=hidden_size,
+            add_inputs=True,
         )
 
         slice_offset += hidden_size
diff --git a/tests/lora/test_punica_variation.py b/tests/lora/test_punica_ops_variation.py
similarity index 74%
rename from tests/lora/test_punica_variation.py
rename to tests/lora/test_punica_ops_variation.py
index 9ee10e7c23ee6..2bb84c1cf11e9 100644
--- a/tests/lora/test_punica_variation.py
+++ b/tests/lora/test_punica_ops_variation.py
@@ -9,19 +9,18 @@
 import torch
 
 # Enable custom op register
-import vllm.lora.ops.bgmv_expand
-import vllm.lora.ops.bgmv_expand_slice
-import vllm.lora.ops.bgmv_shrink
-import vllm.lora.ops.sgmv_expand
-import vllm.lora.ops.sgmv_shrink  # noqa: F401
-from vllm.lora.ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT
+import vllm.lora.ops.triton_ops  # noqa: F401
+from vllm.lora.ops.torch_ops import (bgmv_expand, bgmv_expand_slice,
+                                     bgmv_shrink, sgmv_expand,
+                                     sgmv_expand_slice, sgmv_shrink)
+from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT
 from vllm.platforms import current_platform
 
 from .utils import (assert_close, generate_data,
                     generate_data_for_expand_nslices,
-                    generate_data_for_nslices, ref_torch_groupgemm)
+                    generate_data_for_nslices)
 
-HIDDEN_SIZES = [4097]
+HIDDEN_SIZES = [2049]
 
 BATCHES = [1, 4, 16, 32]
 NUM_LORA = [1, 8, 32, 128]
@@ -29,15 +28,7 @@
 MAX_RANKS = [1, 4, 8, 16, 32, 64, 128, 256]
 SCALES = [0.5]
 SEED = [0]
-CUDA_DEVICES = [f"cuda:{0}"]
-
-# Unlike test_punica_sizes.py, we directly utilize custom op for
-# testing, which verifies the correct registration of these ops.
-bgmv_expand = torch.ops.vllm.bgmv_expand
-bgmv_expand_slice = torch.ops.vllm.bgmv_expand_slice
-bgmv_shrink = torch.ops.vllm.bgmv_shrink
-sgmv_expand = torch.ops.vllm.sgmv_expand
-sgmv_shrink = torch.ops.vllm.sgmv_shrink
+DEVICES = [f"cuda:{0}"]
 
 _dict_lock = Lock()
 
@@ -51,7 +42,7 @@
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("op_type", ["shrink", "expand"])
 @pytest.mark.parametrize("seed", SEED)
-@pytest.mark.parametrize("device", CUDA_DEVICES)
+@pytest.mark.parametrize("device", DEVICES)
 def test_punica_sgmv(
     batches: int,
     num_loras: int,
@@ -98,7 +89,7 @@ def test_punica_sgmv(
         # Preventing cache error pointer.
         with _dict_lock:
             _LORA_A_PTR_DICT.clear()
-            sgmv_shrink(
+            torch.ops.vllm.sgmv_shrink(
                 inputs_tensor,
                 lora_weights_lst,
                 our_out_tensor,
@@ -111,20 +102,23 @@ def test_punica_sgmv(
                 scaling,
             )
         for index in range(nslices):
-            ref_torch_groupgemm(
-                ref_out_tensor[index],
+            sgmv_shrink(
                 inputs_tensor,
                 lora_weights_lst[index],
-                lora_indices_tensor,
+                ref_out_tensor[index],
+                b_seq_start_loc,
                 seq_len_tensor,
+                lora_indices_tensor,
                 batches,
+                max_seq_length,
+                token_nums,
                 scaling,
-                op_type,
             )
+
     else:
         with _dict_lock:
             _LORA_B_PTR_DICT.clear()
-            sgmv_expand(
+            torch.ops.vllm.sgmv_expand(
                 inputs_tensor,
                 lora_weights_lst,
                 our_out_tensor,
@@ -137,21 +131,39 @@ def test_punica_sgmv(
                 offset_start=0,
                 add_inputs=True,
             )
-
         slice_offset = 0
-        for index in range(nslices):
-            lora_weights = lora_weights_lst[index]
-            ref_torch_groupgemm(
-                ref_out_tensor[:, slice_offset:slice_offset + hidden_size],
-                inputs_tensor[index],
-                lora_weights,
-                lora_indices_tensor,
+        if nslices == 1:
+            # Verify torch's sgmv_expand op
+            sgmv_expand(
+                inputs_tensor[0],
+                lora_weights_lst[0],
+                ref_out_tensor,
+                b_seq_start_loc,
                 seq_len_tensor,
+                lora_indices_tensor,
                 batches,
-                1.0,
-                op_type,
+                max_seq_length,
+                token_nums,
+                add_inputs=True,
             )
-            slice_offset += hidden_size
+        else:
+            for index in range(nslices):
+                lora_weights = lora_weights_lst[index]
+                sgmv_expand_slice(
+                    inputs_tensor[index],
+                    lora_weights,
+                    ref_out_tensor,
+                    b_seq_start_loc,
+                    seq_len_tensor,
+                    lora_indices_tensor,
+                    batches,
+                    max_seq_length,
+                    token_nums,
+                    slice_offset,
+                    hidden_size,
+                    add_inputs=True,
+                )
+                slice_offset += hidden_size
 
     assert_close(our_out_tensor, ref_out_tensor)
 
@@ -164,7 +176,7 @@ def test_punica_sgmv(
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("op_type", ["shrink", "expand"])
 @pytest.mark.parametrize("seed", SEED)
-@pytest.mark.parametrize("device", CUDA_DEVICES)
+@pytest.mark.parametrize("device", DEVICES)
 def test_punica_bgmv(
     batches: int,
     num_loras: int,
@@ -176,7 +188,6 @@ def test_punica_bgmv(
     seed: int,
     device: str,
 ):
-
     torch.set_default_device(device)
     current_platform.seed_everything(seed)
 
@@ -201,32 +212,38 @@ def test_punica_bgmv(
         device,
     )
     if op_type == "shrink":
-        bgmv_shrink(
+        torch.ops.vllm.bgmv_shrink(
             inputs_tensor,
             lora_weights,
             our_out_tensor,
             indices,
             scaling,
         )
-    else:
 
-        bgmv_expand(
+        bgmv_shrink(
+            inputs_tensor,
+            lora_weights,
+            ref_out_tensor,
+            indices,
+            scaling,
+        )
+
+    else:
+        torch.ops.vllm.bgmv_expand(
             inputs_tensor,
             lora_weights,
             our_out_tensor,
             indices,
             add_inputs=True,
         )
-    ref_torch_groupgemm(
-        ref_out_tensor,
-        inputs_tensor,
-        lora_weights,
-        lora_indices_tensor,
-        seq_len_tensor,
-        batches,
-        scaling if op_type == "shrink" else 1.0,
-        op_type,
-    )
+        bgmv_expand(
+            inputs_tensor,
+            lora_weights,
+            ref_out_tensor,
+            indices,
+            add_inputs=True,
+        )
+
     if op_type == "shrink":
         ref_out_tensor = ref_out_tensor.to(torch.float32)
     assert_close(our_out_tensor, ref_out_tensor)
@@ -239,7 +256,7 @@ def test_punica_bgmv(
 @pytest.mark.parametrize("nslices", [2, 3])
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("seed", SEED)
-@pytest.mark.parametrize("device", CUDA_DEVICES)
+@pytest.mark.parametrize("device", DEVICES)
 def test_punica_bgmv_expand_nslices(
     batches: int,
     num_loras: int,
@@ -276,7 +293,7 @@ def test_punica_bgmv_expand_nslices(
     slice_offset = 0
     for index in range(nslices):
         lora_weights = lora_weights_lst[index]
-        bgmv_expand_slice(
+        torch.ops.vllm.bgmv_expand_slice(
             inputs_tensor,
             lora_weights,
             our_outputs,
@@ -285,15 +302,14 @@ def test_punica_bgmv_expand_nslices(
             slice_size=hidden_size,
             add_inputs=True,
         )
-        ref_torch_groupgemm(
-            ref_outputs[:, slice_offset:slice_offset + hidden_size],
+        bgmv_expand_slice(
             inputs_tensor,
             lora_weights,
-            lora_indices_tensor,
-            seq_len_tensor,
-            batches,
-            1.0,
-            op_type="expand",
+            ref_outputs,
+            indices,
+            slice_offset,
+            slice_size=hidden_size,
+            add_inputs=True,
         )
 
         slice_offset += hidden_size
diff --git a/tests/lora/test_quant_model.py b/tests/lora/test_quant_model.py
index 026269667b473..26bf770cc0d4a 100644
--- a/tests/lora/test_quant_model.py
+++ b/tests/lora/test_quant_model.py
@@ -72,7 +72,8 @@ def format_prompt_tuples(prompt):
 @pytest.mark.parametrize("tp_size", [1])
 def test_quant_model_lora(tinyllama_lora_files, num_gpus_available, model,
                           tp_size):
-    if num_gpus_available < tp_size:
+    if num_gpus_available < tp_size and \
+        tp_size > 1 and current_platform.is_cuda_alike():
         pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")
 
     llm = vllm.LLM(
diff --git a/tests/lora/utils.py b/tests/lora/utils.py
index b66d18074a7bf..ce47546f2154b 100644
--- a/tests/lora/utils.py
+++ b/tests/lora/utils.py
@@ -104,33 +104,6 @@ def assert_close(a, b):
     torch.testing.assert_close(a, b, rtol=rtol, atol=atol)
 
 
-def ref_torch_groupgemm(
-    out_tensor,
-    inputs,
-    lora_weights,
-    lora_indices_tensor,
-    seq_len_tensor,
-    batches,
-    scaling,
-    op_type,
-) -> torch.Tensor:
-    out_list = []
-    current_offset = 0
-    for lora_index, b_length in zip(range(batches), seq_len_tensor):
-        input_weight = inputs[current_offset:b_length + current_offset, :]
-        current_offset += b_length
-        lora_weight = lora_weights[lora_indices_tensor[lora_index]]
-        result = torch.nn.functional.linear(input_weight, lora_weight)
-        result *= scaling
-        out_list.append(result)
-    cat_result = torch.cat(out_list, dim=0)
-    if op_type == "expand":
-        out_tensor += cat_result
-    else:
-        out_tensor.copy_(cat_result)
-    return
-
-
 def generate_data(
     batches,
     hidden_size,
diff --git a/vllm/executor/cpu_executor.py b/vllm/executor/cpu_executor.py
index c7f018d9a203e..b9a6bee5720fd 100644
--- a/vllm/executor/cpu_executor.py
+++ b/vllm/executor/cpu_executor.py
@@ -22,9 +22,6 @@ class CPUExecutor(ExecutorBase):
 
     def _init_executor(self) -> None:
         assert self.device_config.device_type == "cpu"
-        # Reminder: Please update docs/source/features/compatibility_matrix.md
-        # If the feature combo become valid
-        assert self.lora_config is None, "cpu backend doesn't support LoRA"
 
         #
         # Environment variables for CPU executor
diff --git a/vllm/lora/ops/torch_ops/__init__.py b/vllm/lora/ops/torch_ops/__init__.py
new file mode 100644
index 0000000000000..9c9159b95f308
--- /dev/null
+++ b/vllm/lora/ops/torch_ops/__init__.py
@@ -0,0 +1,13 @@
+from vllm.lora.ops.torch_ops.lora_ops import bgmv_expand  # noqa: F401
+from vllm.lora.ops.torch_ops.lora_ops import (bgmv_expand_slice, bgmv_shrink,
+                                              sgmv_expand, sgmv_expand_slice,
+                                              sgmv_shrink)
+
+__all__ = [
+    "bgmv_expand",
+    "bgmv_expand_slice",
+    "bgmv_shrink",
+    "sgmv_expand",
+    "sgmv_expand_slice",
+    "sgmv_shrink",
+]
diff --git a/vllm/lora/ops/torch_ops/lora_ops.py b/vllm/lora/ops/torch_ops/lora_ops.py
new file mode 100644
index 0000000000000..5f5aafd516159
--- /dev/null
+++ b/vllm/lora/ops/torch_ops/lora_ops.py
@@ -0,0 +1,113 @@
+import torch
+
+
+def sgmv_expand(inputs: torch.Tensor,
+                lora_b_weights: torch.Tensor,
+                output_tensor: torch.Tensor,
+                b_seq_start_loc: torch.Tensor,
+                seq_len_tensor: torch.Tensor,
+                lora_indices_tensor: torch.Tensor,
+                batches: int,
+                max_seq_length: int,
+                token_nums: int,
+                add_inputs: bool = False):
+    """Expand per-sequence LoRA indices to per-token and run bgmv_expand."""
+    # b_seq_start_loc, batches, max_seq_length, token_nums are not used here.
+    token_indices = lora_indices_tensor.repeat_interleave(seq_len_tensor)
+    bgmv_expand(inputs, lora_b_weights, output_tensor, token_indices,
+                add_inputs)
+
+
+def bgmv_expand(inputs: torch.Tensor,
+                lora_b_weights: torch.Tensor,
+                output_tensor: torch.Tensor,
+                lora_indices_tensor: torch.Tensor,
+                add_inputs: bool = True):
+    """Project each input row with its selected weight into output_tensor."""
+    out_dtype = output_tensor.dtype
+    weights = lora_b_weights[lora_indices_tensor].to(dtype=out_dtype)
+    if weights.dim() == 4:
+        weights = weights.squeeze(dim=1)
+    out = torch.einsum("bi, boi -> bo", inputs.to(dtype=out_dtype), weights)
+    # A lone projected row is broadcast over every row of output_tensor;
+    # otherwise the projection is cut to output_tensor's row count.
+    row_limit = 1 if out.shape[0] == 1 else output_tensor.shape[0]
+    num_cols = out.shape[1]
+    update = out[:row_limit, :]
+    if not add_inputs:
+        output_tensor[:, :num_cols] = update
+    else:
+        output_tensor[:, :num_cols] += update
+
+
+def sgmv_shrink(
+    inputs: torch.Tensor,
+    lora_a_weights: torch.Tensor,
+    output_tensor: torch.Tensor,
+    b_seq_start_loc: torch.Tensor,
+    seq_len_tensor: torch.Tensor,
+    lora_indices_tensor: torch.Tensor,
+    batches: int,
+    max_seq_length: int,
+    token_nums: int,
+    scaling: float,
+):
+    """Expand per-sequence LoRA indices to per-token and run bgmv_shrink."""
+    # b_seq_start_loc, batches, max_seq_length, token_nums are not used here.
+    token_indices = lora_indices_tensor.repeat_interleave(seq_len_tensor)
+    bgmv_shrink(inputs, lora_a_weights, output_tensor, token_indices,
+                scaling)
+
+
+def bgmv_shrink(inputs: torch.Tensor,
+                lora_b_weights: torch.Tensor,
+                output_tensor: torch.Tensor,
+                lora_indices_tensor: torch.Tensor,
+                scaling: float = 1.0):
+    """Write the scaled per-row projection into output_tensor."""
+    out_dtype = output_tensor.dtype
+    weights = lora_b_weights[lora_indices_tensor].to(dtype=out_dtype)
+    if weights.dim() == 4:
+        weights = weights.squeeze(dim=1)
+    out = torch.einsum("bi, boi -> bo", inputs.to(dtype=out_dtype), weights)
+    # Only the first out.shape[1] columns of output_tensor are overwritten.
+    output_tensor[:, :out.shape[1]] = scaling * out
+
+
+def sgmv_expand_slice(inputs: torch.Tensor,
+                      lora_b_weights: torch.Tensor,
+                      output_tensor: torch.Tensor,
+                      b_seq_start_loc: torch.Tensor,
+                      seq_len_tensor: torch.Tensor,
+                      lora_indices_tensor: torch.Tensor,
+                      batches: int,
+                      max_seq_length: int,
+                      token_nums: int,
+                      slice_offset: int,
+                      slice_size: int,
+                      add_inputs: bool = False):
+    """Expand per-sequence indices to per-token and run bgmv_expand_slice."""
+    # b_seq_start_loc, batches, max_seq_length, token_nums are not used here.
+    token_indices = lora_indices_tensor.repeat_interleave(seq_len_tensor)
+    bgmv_expand_slice(inputs, lora_b_weights, output_tensor, token_indices,
+                      slice_offset, slice_size, add_inputs)
+
+
+def bgmv_expand_slice(inputs: torch.Tensor,
+                      lora_b_weights: torch.Tensor,
+                      output_tensor: torch.Tensor,
+                      lora_indices_tensor: torch.Tensor,
+                      slice_offset: int,
+                      slice_size: int,
+                      add_inputs: bool = True):
+    """Project each input row into a column slice of output_tensor."""
+    out_dtype = output_tensor.dtype
+    weights = lora_b_weights[lora_indices_tensor].to(dtype=out_dtype)
+    if weights.dim() == 4:
+        weights = weights.squeeze(dim=1)
+    out = torch.einsum("bi, boi -> bo", inputs.to(dtype=out_dtype), weights)
+    slice_end = slice_offset + slice_size
+    if not add_inputs:
+        output_tensor[:, slice_offset:slice_end] = out
+    else:
+        output_tensor[:, slice_offset:slice_end] += out
diff --git a/vllm/lora/ops/triton_ops/__init__.py b/vllm/lora/ops/triton_ops/__init__.py
new file mode 100644
index 0000000000000..9805b6dd5038e
--- /dev/null
+++ b/vllm/lora/ops/triton_ops/__init__.py
@@ -0,0 +1,13 @@
+from vllm.lora.ops.triton_ops.bgmv_expand import bgmv_expand
+from vllm.lora.ops.triton_ops.bgmv_expand_slice import bgmv_expand_slice
+from vllm.lora.ops.triton_ops.bgmv_shrink import bgmv_shrink
+from vllm.lora.ops.triton_ops.sgmv_expand import sgmv_expand
+from vllm.lora.ops.triton_ops.sgmv_shrink import sgmv_shrink  # noqa: F401
+
+__all__ = [
+    "bgmv_expand",
+    "bgmv_expand_slice",
+    "bgmv_shrink",
+    "sgmv_expand",
+    "sgmv_shrink",
+]
diff --git a/vllm/lora/ops/bgmv_expand.py b/vllm/lora/ops/triton_ops/bgmv_expand.py
similarity index 100%
rename from vllm/lora/ops/bgmv_expand.py
rename to vllm/lora/ops/triton_ops/bgmv_expand.py
diff --git a/vllm/lora/ops/bgmv_expand_slice.py b/vllm/lora/ops/triton_ops/bgmv_expand_slice.py
similarity index 100%
rename from vllm/lora/ops/bgmv_expand_slice.py
rename to vllm/lora/ops/triton_ops/bgmv_expand_slice.py
diff --git a/vllm/lora/ops/bgmv_shrink.py b/vllm/lora/ops/triton_ops/bgmv_shrink.py
similarity index 100%
rename from vllm/lora/ops/bgmv_shrink.py
rename to vllm/lora/ops/triton_ops/bgmv_shrink.py
diff --git a/vllm/lora/ops/sgmv_expand.py b/vllm/lora/ops/triton_ops/sgmv_expand.py
similarity index 100%
rename from vllm/lora/ops/sgmv_expand.py
rename to vllm/lora/ops/triton_ops/sgmv_expand.py
diff --git a/vllm/lora/ops/sgmv_shrink.py b/vllm/lora/ops/triton_ops/sgmv_shrink.py
similarity index 100%
rename from vllm/lora/ops/sgmv_shrink.py
rename to vllm/lora/ops/triton_ops/sgmv_shrink.py
diff --git a/vllm/lora/ops/utils.py b/vllm/lora/ops/triton_ops/utils.py
similarity index 100%
rename from vllm/lora/ops/utils.py
rename to vllm/lora/ops/triton_ops/utils.py
diff --git a/vllm/lora/punica_wrapper/punica_cpu.py b/vllm/lora/punica_wrapper/punica_cpu.py
new file mode 100644
index 0000000000000..b9ae3e07492c0
--- /dev/null
+++ b/vllm/lora/punica_wrapper/punica_cpu.py
@@ -0,0 +1,346 @@
+from typing import Callable, Optional, Tuple, Union
+
+import torch
+
+from vllm.lora.ops.torch_ops import (bgmv_expand, bgmv_expand_slice,
+                                     bgmv_shrink, sgmv_expand,
+                                     sgmv_expand_slice, sgmv_shrink)
+
+from .punica_base import PunicaWrapperBase
+
+
+# The platforms that are compatible with the PyTorch-native implementation can
+# inherit this class
+class PunicaWrapperCPU(PunicaWrapperBase):
+    """
+    PunicaWrapperCPU is designed to manage and provide metadata for the punica 
+    kernel. The main function is to maintain the state information for 
+    Multi-LoRA, and to provide the interface for the pytorch punica ops.
+    """
+
+    def __init__(self, max_num_batched_tokens: int, max_batches: int,
+                 device: Union[torch.device, str], **kwargs):
+        PunicaWrapperBase.__init__(self, max_num_batched_tokens, max_batches,
+                                   device)
+
+    def _shrink_prefill(
+        self,
+        y: torch.Tensor,
+        x: torch.Tensor,
+        w_t_all: torch.Tensor,
+        scale: float,
+    ):
+        # No LoRA request, so return directly
+        if self.no_lora:
+            return
+        sgmv_shrink(
+            x,
+            w_t_all,
+            y,
+            *self.prefill_metadata,
+            scale,
+        )
+
+    def _shrink_decode(
+        self,
+        y: torch.Tensor,
+        x: torch.Tensor,
+        w_t_all: torch.Tensor,
+        scale: float,
+    ):
+        bgmv_shrink(x, w_t_all, y, self.token_lora_indices, scale)
+
+    def _expand_prefill(
+        self,
+        y: torch.Tensor,
+        x: torch.Tensor,
+        w_t_all: torch.Tensor,
+        add_inputs: bool,
+    ):
+        # No LoRA request, so return directly
+        if self.no_lora:
+            return
+        sgmv_expand(
+            x,
+            w_t_all,
+            y,
+            *self.prefill_metadata,
+            add_inputs,
+        )
+
+    def _expand_decode(
+        self,
+        y: torch.Tensor,
+        x: torch.Tensor,
+        w_t_all: torch.Tensor,
+        add_inputs: bool,
+    ):
+        bgmv_expand(x, w_t_all, y, self.token_lora_indices, add_inputs)
+
+    def _expand_slice_prefill(
+        self,
+        y: torch.Tensor,
+        x: torch.Tensor,
+        w_t_all: torch.Tensor,
+        y_offset: int,
+        y_slice_size: int,
+        add_inputs: bool,
+    ):
+        # No LoRA request, so return directly
+        if self.no_lora:
+            return
+        sgmv_expand_slice(
+            x,
+            w_t_all,
+            y,
+            *self.prefill_metadata,
+            y_offset,
+            y_slice_size,
+            add_inputs,
+        )
+
+    def _expand_slice_decode(
+        self,
+        y: torch.Tensor,
+        x: torch.Tensor,
+        w_t_all: torch.Tensor,
+        y_offset: int,
+        y_slice_size: int,
+        add_inputs: bool,
+    ):
+        bgmv_expand_slice(x, w_t_all, y, self.token_lora_indices, y_offset,
+                          y_slice_size, add_inputs)
+
+    def _apply_expand(
+        self,
+        y: torch.Tensor,
+        x: torch.Tensor,
+        w_t_all: torch.Tensor,
+        y_offset: int,
+        y_slice_size: int,
+        add_inputs: bool = True,
+    ):
+        """
+        Perform the `y[:, y_offset:y_offset+y_slice_size] += x @ w_t_all`
+        computation, which is suitable for the
+        GEMM of lora_b.
+        """
+
+        expand_slice_fun: Callable = (self._expand_slice_prefill
+                                      if self.is_prefill else
+                                      self._expand_slice_decode)
+        expand_slice_fun(y, x, w_t_all, y_offset, y_slice_size, add_inputs)
+
+    def _apply_shrink(self, y: torch.Tensor, x: torch.Tensor,
+                      w_t_all: torch.Tensor, scale: float):
+        """
+        Perform the `y += x @ w_t_all` computation, which is suitable for the
+        GEMM of lora_a.
+        When `is_prefill` is true, it indicates that it is currently the
+        prefill stage, and the `_shrink_prefill` function should be called.
+        Otherwise, it is the decode stage, and the _shrink_decode function
+        should be called.
+        """
+        y_org = y
+        y = y.view(-1, y.shape[-1])
+        shrink_fun: Callable = (self._shrink_prefill
+                                if self.is_prefill else self._shrink_decode)
+        shrink_fun(y, x, w_t_all, scale)
+        y = y.view_as(y_org)
+
+    def add_shrink(self, y: Union[Tuple[torch.Tensor, ...], torch.Tensor],
+                   x: torch.Tensor, lora_a_stacked: Tuple[torch.Tensor, ...],
+                   scale: float, **kwargs):
+        """
+        Performs GEMM for multiple slices of lora_a.
+        When `is_prefill` is true, it indicates that it is currently the
+        prefill stage, and the `_shrink_prefill` function should be called.
+        Otherwise, it is the decode stage, and the _shrink_decode function
+        should be called.
+            
+        Semantics:
+        for i in range(len(lora_a_stacked)):
+            y[i] += (x @ lora_a_stacked[i]) * scale
+        
+        Args:
+            y (Union[Tuple[torch.Tensor, ...], torch.Tensor]): Output tensors
+            x (torch.Tensor): Input tensor
+            lora_a_stacked (Tuple[torch.Tensor, ...]): lora_a's weights
+            scale (float): Scaling factor for the operation
+        """
+
+        x = x.view(-1, x.shape[-1])
+        # TODO fuse these kernels
+        for slice_idx in range(len(lora_a_stacked)):
+            self._apply_shrink(y[slice_idx], x, lora_a_stacked[slice_idx],
+                               scale)
+
+    def add_expand(self,
+                   y: torch.Tensor,
+                   x: Union[Tuple[torch.Tensor, ...], torch.Tensor],
+                   lora_b_stacked: Tuple[torch.Tensor, ...],
+                   lora_bias_stacked: Optional[Tuple[torch.Tensor, ...]],
+                   output_slices: Tuple[int, ...],
+                   offset_start: int = 0,
+                   add_inputs=True,
+                   **kwargs) -> None:
+        """
+        Performs GEMM and bias addition for multiple slices of lora_b.
+      
+        Semantics:
+            for i in range(len(lora_b_stacked)):
+                slice = output_slices[i]
+                y[:, offset:offset+slice] += x[i] @ lora_b_stacked[i] + 
+                    lora_bias_stacked[i] 
+                offset += slice
+            
+        Args:
+            y (torch.Tensor): Output tensor.
+            x (Union[Tuple[torch.Tensor, ...], torch.Tensor]): Input tensors
+            lora_b_stacked (Tuple[torch.Tensor, ...]): lora_b's weight
+            lora_bias_stacked (Optional[Tuple[torch.Tensor, ...]]): 
+                bias's weight
+            output_slices (Tuple[int, ...]): Every slice's size
+            add_inputs (bool):  Defaults to True.
+        """
+        y_org = y
+        y = y.view(-1, y.shape[-1])
+        offset_left = offset_start
+        if lora_bias_stacked is not None:
+            self._apply_bias(self.token_lora_indices, y, output_slices,
+                             lora_bias_stacked)
+        for slice_idx in range(len(lora_b_stacked)):
+            self._apply_expand(
+                y,
+                x[slice_idx],
+                lora_b_stacked[slice_idx],
+                offset_left,
+                output_slices[slice_idx],
+                add_inputs=add_inputs,
+            )
+            offset_left += output_slices[slice_idx]
+        y = y.view_as(y_org)
+
+    def add_lora_embedding(self,
+                           y: torch.Tensor,
+                           x: torch.Tensor,
+                           lora_b_stacked: torch.Tensor,
+                           add_inputs: bool = True,
+                           **kwargs) -> None:
+        """
+        Applies lora specifically for VocabParallelEmbeddingWithLoRA.
+
+        Semantics:
+            y += x @ lora_b_stacked
+
+        Args:
+            y (torch.Tensor): Output tensor.
+            x (torch.Tensor): Input tensor.
+            lora_b_stacked (torch.Tensor): lora_b's weights.
+            add_inputs (bool): Defaults to True.
+        """
+
+        # Embedding layer only needs the expand op
+        expand_fun: Callable = (self._expand_prefill
+                                if self.is_prefill else self._expand_decode)
+        expand_fun(y, x, lora_b_stacked, add_inputs)
+
+    def add_lora_linear(self,
+                        y: torch.Tensor,
+                        x: torch.Tensor,
+                        lora_a_stacked: Tuple[torch.Tensor, ...],
+                        lora_b_stacked: Tuple[torch.Tensor, ...],
+                        lora_bias_stacked: Optional[Tuple[torch.Tensor, ...]],
+                        scale: float,
+                        output_slices: Tuple[int, ...],
+                        *,
+                        buffer: Optional[Tuple[torch.Tensor, ...]] = None,
+                        **kwargs) -> None:
+        """
+        Applicable to linear-related lora. 
+
+        Semantics:
+            for i in range(len(lora_a_stacked)):
+                y[i] += (
+                    x[i].unsqueeze(0)
+                    @ lora_a_stacked[indices[i], layer_idx, :, :]
+                    @ lora_b_stacked[indices[i], layer_idx, :, :]
+                    * scale
+                    ).squeeze(0)+lora_bias_stacked[i]
+
+        Args:
+            y (torch.Tensor): Output tensor. Will be changed in-place.
+            x (torch.Tensor): Input tensor
+            lora_a_stacked (Tuple[torch.Tensor, ...]): lora_a's weight.
+            lora_b_stacked (Tuple[torch.Tensor, ...]): lora_b's weight.
+            lora_bias_stacked (Optional[Tuple[torch.Tensor, ...]]): lora's bias.
+            scale (float): Scaling factor.
+            output_slices (Tuple[int, ...]): Every slice's size.
+            buffer (Optional[Tuple[torch.Tensor, ...]]): Defaults to None.
+        """
+
+        assert len(lora_a_stacked) == len(lora_b_stacked) == len(output_slices)
+        if lora_bias_stacked is not None:
+            assert len(lora_bias_stacked) == len(output_slices)
+            y = self._apply_bias(self.token_lora_indices, y, output_slices,
+                                 lora_bias_stacked)
+
+        if buffer is None:
+            r = lora_b_stacked[0].size(-1)
+            # We set the buffer to be float32 by default, consistent with the
+            # triton op
+            buffer = tuple(
+                torch.zeros(
+                    (x.size(0), r), dtype=torch.float32, device=x.device)
+                for _ in range(len(output_slices)))
+        self.add_shrink(buffer, x, lora_a_stacked, scale, **kwargs)
+        self.add_expand(y,
+                        buffer,
+                        lora_b_stacked,
+                        None,
+                        output_slices,
+                        add_inputs=True,
+                        **kwargs)
+
+    def add_lora_logits(self,
+                        y: torch.Tensor,
+                        x: torch.Tensor,
+                        lora_a_stacked: torch.Tensor,
+                        lora_b_stacked: torch.Tensor,
+                        scale,
+                        *,
+                        buffer: Optional[torch.Tensor] = None,
+                        **kwargs) -> None:
+        """
+        Applies lora specifically for LogitsProcessorWithLoRA.
+        
+        Semantics:
+            buffer = (x @ lora_a_stacked) * scale
+            y += buffer @ lora_b_stacked
+
+        Args:
+            y (torch.Tensor): Output tensor.
+            x (torch.Tensor): Input tensor.
+            lora_a_stacked (torch.Tensor): lora_a's weights.
+            lora_b_stacked (torch.Tensor): lora_b's weights.
+            scale (float): Scaling factor.
+            buffer (Optional[torch.Tensor]): Defaults to None.
+        """
+        y_org = y
+        y = y.view(-1, y.shape[-1])
+        x = x.view(-1, x.shape[-1])
+        r = lora_b_stacked.size(-1)
+        if buffer is None:
+            # We set the buffer to be float32 by default, consistent with the
+            # triton op
+            buffer = torch.zeros((x.size(0), r),
+                                 dtype=torch.float32,
+                                 device=x.device)
+        # LogitsProcessorWithLoRA always uses bgmv.
+        bgmv_shrink(x, lora_a_stacked, buffer, self.sampler_indices, scale)
+        bgmv_expand(buffer,
+                    lora_b_stacked,
+                    y,
+                    self.sampler_indices,
+                    add_inputs=True)
+        y = y.view_as(y_org)
diff --git a/vllm/lora/punica_wrapper/punica_gpu.py b/vllm/lora/punica_wrapper/punica_gpu.py
index 278f7b5a8e9f4..451f23e49f27c 100644
--- a/vllm/lora/punica_wrapper/punica_gpu.py
+++ b/vllm/lora/punica_wrapper/punica_gpu.py
@@ -12,11 +12,11 @@
 from vllm.triton_utils import HAS_TRITON
 
 if HAS_TRITON:
-    from vllm.lora.ops.bgmv_expand import bgmv_expand
-    from vllm.lora.ops.bgmv_expand_slice import bgmv_expand_slice
-    from vllm.lora.ops.bgmv_shrink import bgmv_shrink
-    from vllm.lora.ops.sgmv_expand import sgmv_expand
-    from vllm.lora.ops.sgmv_shrink import sgmv_shrink
+    from vllm.lora.ops.triton_ops import bgmv_expand
+    from vllm.lora.ops.triton_ops import bgmv_expand_slice
+    from vllm.lora.ops.triton_ops import bgmv_shrink
+    from vllm.lora.ops.triton_ops import sgmv_expand
+    from vllm.lora.ops.triton_ops import sgmv_shrink
 
 from .punica_base import PunicaWrapperBase
 
diff --git a/vllm/lora/punica_wrapper/punica_selector.py b/vllm/lora/punica_wrapper/punica_selector.py
index 9791d492d8e48..9f1606e672dea 100644
--- a/vllm/lora/punica_wrapper/punica_selector.py
+++ b/vllm/lora/punica_wrapper/punica_selector.py
@@ -12,6 +12,11 @@ def get_punica_wrapper(*args, **kwargs) -> PunicaWrapperBase:
         from vllm.lora.punica_wrapper.punica_gpu import PunicaWrapperGPU
         logger.info_once("Using PunicaWrapperGPU.")
         return PunicaWrapperGPU(*args, **kwargs)
+    elif current_platform.is_cpu():
+        # Lazy import to avoid ImportError
+        from vllm.lora.punica_wrapper.punica_cpu import PunicaWrapperCPU
+        logger.info_once("Using PunicaWrapperCPU.")
+        return PunicaWrapperCPU(*args, **kwargs)
     elif current_platform.is_hpu():
         # Lazy import to avoid ImportError
         from vllm.lora.punica_wrapper.punica_hpu import PunicaWrapperHPU
diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py
index d99db4e0c6c40..303d9a15e9c3c 100644
--- a/vllm/worker/cpu_model_runner.py
+++ b/vllm/worker/cpu_model_runner.py
@@ -2,8 +2,8 @@
 import weakref
 from collections import defaultdict
 from dataclasses import dataclass
-from typing import (TYPE_CHECKING, Any, Dict, List, Optional, Type, TypeVar,
-                    Union)
+from typing import (TYPE_CHECKING, Any, Dict, List, Optional, Set, Type,
+                    TypeVar, Union)
 
 import torch
 from torch import nn
@@ -12,10 +12,14 @@
 from vllm.config import VllmConfig
 from vllm.forward_context import set_forward_context
 from vllm.logger import init_logger
+from vllm.lora.layers import LoRAMapping
+from vllm.lora.request import LoRARequest
+from vllm.lora.worker_manager import LRUCacheWorkerLoRAManager
 from vllm.model_executor import SamplingMetadata
 from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding
 from vllm.model_executor.layers.sampler import SamplerOutput
 from vllm.model_executor.model_loader import get_model
+from vllm.model_executor.models import supports_lora, supports_multimodal
 from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs,
                              MultiModalKwargs, MultiModalPlaceholderMap)
 from vllm.sequence import (IntermediateTensors, SequenceData,
@@ -49,6 +53,8 @@ class ModelInputForCPU(ModelRunnerInputBase):
     virtual_engine: Optional[int] = None
     seq_lens: Optional[List[int]] = None
     query_lens: Optional[List[int]] = None
+    lora_mapping: Optional["LoRAMapping"] = None
+    lora_requests: Optional[Set[LoRARequest]] = None
 
     def as_broadcastable_tensor_dict(
             self) -> Dict[str, Union[int, torch.Tensor]]:
@@ -57,6 +63,8 @@ def as_broadcastable_tensor_dict(
             "input_positions": self.input_positions,
             "token_type_ids": self.token_type_ids,
             "multi_modal_kwargs": self.multi_modal_kwargs,
+            "lora_requests": self.lora_requests,
+            "lora_mapping": self.lora_mapping,
         }
         _add_attn_metadata_broadcastable_dict(tensor_dict, self.attn_metadata)
 
@@ -143,7 +151,11 @@ def __init__(self,
                                 or runner.cache_config.enable_prefix_caching)
         self.model_input_cls = self.runner._model_input_cls
         self.attn_backend = self.runner.attn_backend
+        self.sliding_window = self.runner.sliding_window
+        self.block_size = self.runner.block_size
+        self.device = self.runner.device
         self.multi_modal_input_mapper = self.runner.multi_modal_input_mapper
+        self.enable_lora = self.runner.lora_config is not None
         self.input_data = ModelInputForCPUBuilder.ModelInputData(
             self.runner.model_config.uses_mrope)
         self.att_metadata_builder = self.runner.attn_backend.get_builder_cls()(
@@ -183,15 +195,28 @@ def build(self) -> ModelInputForCPU:
         attn_metadata = self.att_metadata_builder.build(
             input_data.seq_lens, input_data.query_lens, -1, -1)
 
-        return self.model_input_cls(
-            input_tokens=input_tokens,
-            input_positions=input_positions,
-            token_type_ids=token_type_ids,
-            seq_lens=input_data.seq_lens,
-            query_lens=input_data.query_lens,
-            attn_metadata=attn_metadata,
-            multi_modal_kwargs=multi_modal_kwargs,
-        )
+        is_prompt = (self.seq_group_metadata_list[0].is_prompt
+                     if self.seq_group_metadata_list else None)
+        # LoRA data.
+        lora_requests = set()
+        lora_mapping = None
+        if self.enable_lora:
+            lora_requests = set(seq.lora_request
+                                for seq in self.seq_group_metadata_list
+                                if seq.lora_request is not None)
+
+            lora_mapping = self._prepare_lora_input(
+                self.seq_group_metadata_list, is_prompt)
+
+        return self.model_input_cls(input_tokens=input_tokens,
+                                    input_positions=input_positions,
+                                    token_type_ids=token_type_ids,
+                                    seq_lens=input_data.seq_lens,
+                                    query_lens=input_data.query_lens,
+                                    attn_metadata=attn_metadata,
+                                    multi_modal_kwargs=multi_modal_kwargs,
+                                    lora_mapping=lora_mapping,
+                                    lora_requests=lora_requests)
 
     def _build_input_data(self):
         for seq_group_metadata in self.seq_group_metadata_list:
@@ -381,6 +406,24 @@ def _compute_multi_modal_input(self,
             self.input_data.multi_modal_placeholder_maps[modality].extend(
                 placeholder_map)
 
+    def _prepare_lora_input(
+            self, seq_group_metadata_list: List[SequenceGroupMetadata],
+            is_prefill: bool) -> LoRAMapping:
+        index_mapping = []
+        prompt_mapping = []
+        for seq in seq_group_metadata_list:
+            lora_id = seq.lora_int_id
+            query_len = seq.token_chunk_size
+
+            index_mapping += [lora_id] * query_len
+            prompt_mapping += [lora_id] * (
+                query_len if seq.sampling_params
+                and seq.sampling_params.prompt_logprobs is not None else 1)
+
+        return LoRAMapping(index_mapping=tuple(index_mapping),
+                           prompt_mapping=tuple(prompt_mapping),
+                           is_prefill=is_prefill)
+
 
 class CPUModelRunnerBase(ModelRunnerBase[TModelInputForCPU]):
     """
@@ -431,10 +474,41 @@ def __init__(
 
         # Lazy initialization.
         self.model: nn.Module  # Set after init_Model
+        # Set after load_model.
+        self.lora_manager: Optional[LRUCacheWorkerLoRAManager] = None
 
     def load_model(self) -> None:
         self.model = get_model(vllm_config=self.vllm_config)
 
+        if self.lora_config:
+            assert supports_lora(
+                self.model
+            ), f"{self.model.__class__.__name__} does not support LoRA yet."
+
+            if supports_multimodal(self.model):
+                logger.warning("Regarding multimodal models, vLLM currently "
+                               "only supports adding LoRA to language model.")
+
+            # It's necessary to distinguish between the max_position_embeddings
+            # of VLMs and LLMs.
+            if hasattr(self.model.config, "max_position_embeddings"):
+                max_pos_embeddings = self.model.config.max_position_embeddings
+            else:
+                max_pos_embeddings = (
+                    self.model.config.text_config.max_position_embeddings)
+
+            self.lora_manager = LRUCacheWorkerLoRAManager(
+                self.scheduler_config.max_num_seqs,
+                self.scheduler_config.max_num_batched_tokens,
+                self.vocab_size,
+                self.lora_config,
+                self.device,
+                self.model.embedding_modules,
+                self.model.embedding_padding_modules,
+                max_position_embeddings=max_pos_embeddings,
+            )
+            self.model = self.lora_manager.create_lora_manager(self.model)
+
     def _prepare_model_input_tensors(
         self,
         seq_group_metadata_list: List[SequenceGroupMetadata],
@@ -459,6 +533,37 @@ def sampler(self):
     def vocab_size(self) -> int:
         return self.model_config.get_vocab_size()
 
+    def remove_all_loras(self):
+        if not self.lora_manager:
+            raise RuntimeError("LoRA is not enabled.")
+        self.lora_manager.remove_all_adapters()
+
+    def set_active_loras(self, lora_requests: Set[LoRARequest],
+                         lora_mapping: LoRAMapping) -> None:
+        if not self.lora_manager:
+            raise RuntimeError("LoRA is not enabled.")
+        self.lora_manager.set_active_adapters(lora_requests, lora_mapping)
+
+    def add_lora(self, lora_request: LoRARequest) -> bool:
+        if not self.lora_manager:
+            raise RuntimeError("LoRA is not enabled.")
+        return self.lora_manager.add_adapter(lora_request)
+
+    def remove_lora(self, lora_id: int) -> bool:
+        if not self.lora_manager:
+            raise RuntimeError("LoRA is not enabled.")
+        return self.lora_manager.remove_adapter(lora_id)
+
+    def pin_lora(self, lora_id: int) -> bool:
+        if not self.lora_manager:
+            raise RuntimeError("LoRA is not enabled.")
+        return self.lora_manager.pin_adapter(lora_id)
+
+    def list_loras(self) -> Set[int]:
+        if not self.lora_manager:
+            raise RuntimeError("LoRA is not enabled.")
+        return self.lora_manager.list_adapters()
+
 
 class CPUModelRunner(CPUModelRunnerBase[ModelInputForCPUWithSamplingMetadata]):
     _model_input_cls: Type[ModelInputForCPUWithSamplingMetadata] = (
@@ -515,6 +620,12 @@ def execute_model(
             raise ValueError(
                 "CPU worker does not support multi-step execution.")
 
+        if self.lora_config:
+            assert model_input.lora_requests is not None
+            assert model_input.lora_mapping is not None
+            self.set_active_loras(model_input.lora_requests,
+                                  model_input.lora_mapping)
+
         model_executable = self.model
 
         multimodal_kwargs = {}
diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py
index 494c6506f3c0f..3e5fcf11b9e16 100644
--- a/vllm/worker/cpu_worker.py
+++ b/vllm/worker/cpu_worker.py
@@ -1,5 +1,5 @@
 """A CPU worker class."""
-from typing import Dict, List, Optional, Tuple, Type
+from typing import Dict, List, Optional, Set, Tuple, Type
 
 import torch
 import torch.distributed
@@ -11,14 +11,14 @@
 from vllm.distributed import (ensure_model_parallel_initialized,
                               init_distributed_environment)
 from vllm.logger import init_logger
+from vllm.lora.request import LoRARequest
 from vllm.model_executor import set_random_seed
 from vllm.sequence import ExecuteModelRequest
 from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, bind_kv_cache
 from vllm.worker.cpu_enc_dec_model_runner import CPUEncoderDecoderModelRunner
 from vllm.worker.cpu_model_runner import CPUModelRunner, CPUModelRunnerBase
 from vllm.worker.cpu_pooling_model_runner import CPUPoolingModelRunner
-from vllm.worker.worker_base import (LocalOrDistributedWorkerBase,
-                                     LoraNotSupportedWorkerBase, WorkerBase,
+from vllm.worker.worker_base import (LocalOrDistributedWorkerBase, WorkerBase,
                                      WorkerInput)
 
 logger = init_logger(__name__)
@@ -111,7 +111,7 @@ def get_cache_block_size(
         return dtype_size * total
 
 
-class CPUWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase):
+class CPUWorker(LocalOrDistributedWorkerBase):
     """A worker class that executes (a partition of) the model on a CPU socket.
 
     Each worker is associated with a single CPU socket. The worker is 
@@ -266,6 +266,18 @@ def initialize_cache(self, num_gpu_blocks: int,
         # Initialize the cache.
         self._init_cache_engine()
 
+    def add_lora(self, lora_request: LoRARequest) -> bool:
+        return self.model_runner.add_lora(lora_request)
+
+    def remove_lora(self, lora_id: int) -> bool:
+        return self.model_runner.remove_lora(lora_id)
+
+    def pin_lora(self, lora_id: int) -> bool:
+        return self.model_runner.pin_lora(lora_id)
+
+    def list_loras(self) -> Set[int]:
+        return self.model_runner.list_loras()
+
     def _validate_num_cpu_blocks(self, num_cpu_blocks: int) -> None:
         """Raise errors if the num_cpu_blocks is invalid.
         """

From 263a870ee18bd6a90e25dbfa342be32c6b92c33e Mon Sep 17 00:00:00 2001
From: Avshalom Manevich <12231371+avshalomman@users.noreply.github.com>
Date: Sun, 12 Jan 2025 17:53:51 +0200
Subject: [PATCH 094/115] [Hardware][TPU] workaround fix for MoE on TPU
 (#11764)

---
 tests/kernels/test_moe.py                     |  7 +++
 vllm/model_executor/layers/fused_moe/layer.py |  3 +-
 .../layers/fused_moe/moe_torch_iterative.py   | 51 +++++++++++++++++++
 3 files changed, 60 insertions(+), 1 deletion(-)
 create mode 100644 vllm/model_executor/layers/fused_moe/moe_torch_iterative.py

diff --git a/tests/kernels/test_moe.py b/tests/kernels/test_moe.py
index 8b23b62826053..7fa5de1984452 100644
--- a/tests/kernels/test_moe.py
+++ b/tests/kernels/test_moe.py
@@ -14,6 +14,8 @@
 from vllm.model_executor.layers.fused_moe import fused_moe
 from vllm.model_executor.layers.fused_moe.fused_moe import (
     fused_topk, moe_align_block_size)
+from vllm.model_executor.layers.fused_moe.moe_torch_iterative import (
+    fused_moe as iterative_moe)
 from vllm.model_executor.layers.quantization.utils.marlin_utils_test import (
     marlin_quantize)
 from vllm.model_executor.models.mixtral import MixtralMoE
@@ -46,6 +48,11 @@ def test_fused_moe(
     triton_output = fused_moe(a, w1, w2, score, topk, renormalize=False)
     torch_output = torch_moe(a, w1, w2, score, topk)
     torch.testing.assert_close(triton_output, torch_output, atol=2e-2, rtol=0)
+    iterative_output = iterative_moe(a, w1, w2, score, topk, renormalize=False)
+    torch.testing.assert_close(iterative_output,
+                               torch_output,
+                               atol=2e-2,
+                               rtol=0)
 
 
 @pytest.mark.parametrize("dtype",
diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py
index cf5db368926b4..3d822fc0c7f99 100644
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -20,7 +20,8 @@
 else:
     fused_experts = None  # type: ignore
 if current_platform.is_tpu():
-    from .moe_pallas import fused_moe as fused_moe_pallas
+    # the iterative moe implementation is used until the moe_pallas is fixed
+    from .moe_torch_iterative import fused_moe as fused_moe_pallas
 else:
     fused_moe_pallas = None  # type: ignore
 logger = init_logger(__name__)
diff --git a/vllm/model_executor/layers/fused_moe/moe_torch_iterative.py b/vllm/model_executor/layers/fused_moe/moe_torch_iterative.py
new file mode 100644
index 0000000000000..bcff55f4fdf16
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/moe_torch_iterative.py
@@ -0,0 +1,51 @@
+import torch
+import torch.nn.functional as F
+
+
+def fused_moe(
+    hidden_states: torch.Tensor,
+    w1: torch.Tensor,
+    w2: torch.Tensor,
+    gating_output: torch.Tensor,
+    topk: int,
+    renormalize: bool,
+) -> torch.Tensor:
+    """Mixture-of-experts forward pass that evaluates every expert in turn.
+
+    Args:
+        hidden_states: [*, hidden_size]
+        w1: [num_experts, intermediate_size * 2, hidden_size]
+        w2: [num_experts, hidden_size, intermediate_size]
+        gating_output: [*, num_experts]
+        topk: number of experts kept per token after the softmax
+        renormalize: if True, rescale the kept weights to sum to 1 per token
+    """
+    out_shape = hidden_states.shape
+    hidden_size = out_shape[-1]
+    num_tokens = out_shape[:-1].numel()
+    num_experts, inter_size = w1.shape[0], w2.shape[-1]
+    out_dtype = hidden_states.dtype
+
+    tokens = hidden_states.view(num_tokens, hidden_size)
+    router_probs = gating_output.view(num_tokens, num_experts).softmax(
+        dim=-1, dtype=torch.float)
+    topk_weights, selected_experts = router_probs.topk(topk, dim=-1)
+    if renormalize:
+        topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
+    topk_weights = topk_weights.to(out_dtype)
+
+    final_hidden_states = None
+    for expert_idx in range(num_experts):
+        # Routing weight of this expert per token; 0 where it was not picked.
+        picked = selected_experts == expert_idx
+        expert_weights = (topk_weights * picked).sum(dim=-1, keepdim=True)
+        # The first half of the w1 projection gates (via SiLU) the second.
+        up = F.linear(tokens, w1[expert_idx])
+        act = up[:, inter_size:] * F.silu(up[:, :inter_size])
+        contrib = F.linear(act, w2[expert_idx]) * expert_weights
+        if final_hidden_states is None:
+            final_hidden_states = contrib
+        else:
+            final_hidden_states = final_hidden_states + contrib
+
+    return final_hidden_states.view(out_shape)  # type: ignore

From 9597a095f2c02670b44f5973635ce4b9852e8eab Mon Sep 17 00:00:00 2001
From: Robert Shaw
 <114415538+robertgshaw2-neuralmagic@users.noreply.github.com>
Date: Sun, 12 Jan 2025 16:02:02 -0500
Subject: [PATCH 095/115] [V1][Core][1/n] Logging and Metrics (#11962)

Signed-off-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com>
---
 tests/v1/engine/test_engine_core.py        |  4 +-
 tests/v1/engine/test_engine_core_client.py |  4 +-
 vllm/v1/core/scheduler.py                  | 20 ++++++--
 vllm/v1/engine/__init__.py                 |  3 ++
 vllm/v1/engine/async_llm.py                | 26 ++++++++---
 vllm/v1/engine/core.py                     | 53 +++++++---------------
 vllm/v1/engine/core_client.py              | 48 ++++++++------------
 vllm/v1/engine/llm_engine.py               |  5 +-
 vllm/v1/metrics/__init__.py                |  0
 vllm/v1/metrics/loggers.py                 | 38 ++++++++++++++++
 vllm/v1/metrics/stats.py                   | 12 +++++
 11 files changed, 129 insertions(+), 84 deletions(-)
 create mode 100644 vllm/v1/metrics/__init__.py
 create mode 100644 vllm/v1/metrics/loggers.py
 create mode 100644 vllm/v1/metrics/stats.py

diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py
index 5b1732036e807..cccfd305ac604 100644
--- a/tests/v1/engine/test_engine_core.py
+++ b/tests/v1/engine/test_engine_core.py
@@ -80,7 +80,7 @@ def test_engine_core(monkeypatch):
         assert len(engine_core.scheduler.running) == 4
 
         # Loop through until they are all done.
-        while len(engine_core.step()) > 0:
+        while len(engine_core.step().outputs) > 0:
             pass
 
         assert len(engine_core.scheduler.waiting) == 0
@@ -170,7 +170,7 @@ def test_engine_core_advanced_sampling(monkeypatch):
         assert len(engine_core.scheduler.waiting) == 1
         assert len(engine_core.scheduler.running) == 0
         # Loop through until they are all done.
-        while len(engine_core.step()) > 0:
+        while len(engine_core.step().outputs) > 0:
             pass
 
         assert len(engine_core.scheduler.waiting) == 0
diff --git a/tests/v1/engine/test_engine_core_client.py b/tests/v1/engine/test_engine_core_client.py
index 7eac16f2cf542..e2c728b22d481 100644
--- a/tests/v1/engine/test_engine_core_client.py
+++ b/tests/v1/engine/test_engine_core_client.py
@@ -43,7 +43,7 @@ def make_request(params: SamplingParams) -> EngineCoreRequest:
 def loop_until_done(client: EngineCoreClient, outputs: Dict):
 
     while True:
-        engine_core_outputs = client.get_output()
+        engine_core_outputs = client.get_output().outputs
 
         if len(engine_core_outputs) == 0:
             break
@@ -61,7 +61,7 @@ def loop_until_done(client: EngineCoreClient, outputs: Dict):
 async def loop_until_done_async(client: EngineCoreClient, outputs: Dict):
 
     while True:
-        engine_core_outputs = await client.get_output_async()
+        engine_core_outputs = (await client.get_output_async()).outputs
 
         if len(engine_core_outputs) == 0:
             break
diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py
index b26716f5c02e6..f04e529891287 100644
--- a/vllm/v1/core/scheduler.py
+++ b/vllm/v1/core/scheduler.py
@@ -8,7 +8,8 @@
 from vllm.sampling_params import SamplingParams
 from vllm.v1.core.encoder_cache_manager import EncoderCacheManager
 from vllm.v1.core.kv_cache_manager import KVCacheManager
-from vllm.v1.engine import EngineCoreOutput
+from vllm.v1.engine import EngineCoreOutput, EngineCoreOutputs
+from vllm.v1.metrics.stats import SchedulerStats
 from vllm.v1.outputs import ModelRunnerOutput
 from vllm.v1.request import Request, RequestStatus
 
@@ -394,12 +395,12 @@ def update_from_output(
         self,
         scheduler_output: "SchedulerOutput",
         model_runner_output: "ModelRunnerOutput",
-    ) -> List[EngineCoreOutput]:
+    ) -> EngineCoreOutputs:
         # NOTE(woosuk): This method doesn't consider speculative decoding.
         sampled_token_ids = model_runner_output.sampled_token_ids
         num_scheduled_tokens = scheduler_output.num_scheduled_tokens
         new_running: List[Request] = []
-        engine_core_outputs: List[EngineCoreOutput] = []
+        outputs: List[EngineCoreOutput] = []
         for request in self.running:
             req_id = request.request_id
             request.num_computed_tokens += num_scheduled_tokens[req_id]
@@ -438,7 +439,7 @@ def update_from_output(
                     finished=request.is_finished(),
                     finish_reason=request.get_finished_reason(),
                     stop_reason=request.stop_reason)
-                engine_core_outputs.append(output)
+                outputs.append(output)
 
                 # Breakout of the loop.
                 if stopped:
@@ -446,7 +447,10 @@ def update_from_output(
 
             new_running.append(request)
         self.running = new_running
-        return engine_core_outputs
+        return EngineCoreOutputs(
+            outputs=outputs,
+            scheduler_stats=self.make_stats(),
+        )
 
     def _check_stop(self, request: Request) -> bool:
         if (request.num_tokens >= self.max_model_len
@@ -515,6 +519,12 @@ def get_num_unfinished_requests(self) -> int:
     def has_unfinished_requests(self) -> bool:
         return self.get_num_unfinished_requests() > 0
 
+    def make_stats(self) -> SchedulerStats:
+        """Snapshot the current lengths of self.running and self.waiting."""
+        num_running, num_waiting = len(self.running), len(self.waiting)
+        return SchedulerStats(num_running_reqs=num_running,
+                              num_waiting_reqs=num_waiting)
+
 
 @dataclass
 class NewRequestData:
diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py
index 3ce9db0e47eed..6d90c38c72cf5 100644
--- a/vllm/v1/engine/__init__.py
+++ b/vllm/v1/engine/__init__.py
@@ -4,6 +4,8 @@
 
 import msgspec
 
+from vllm.v1.metrics.stats import SchedulerStats
+
 if TYPE_CHECKING:
     from vllm.lora.request import LoRARequest
     from vllm.multimodal import MultiModalKwargs
@@ -56,6 +58,7 @@ class EngineCoreOutputs(
 
     # [num_reqs]
     outputs: List[EngineCoreOutput]
+    scheduler_stats: SchedulerStats
 
 
 @dataclass
diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py
index 5daae45dee85c..e0ceb59dffcbd 100644
--- a/vllm/v1/engine/async_llm.py
+++ b/vllm/v1/engine/async_llm.py
@@ -4,7 +4,6 @@
 
 from vllm.config import ModelConfig, VllmConfig
 from vllm.engine.arg_utils import AsyncEngineArgs
-from vllm.engine.metrics_types import StatLoggerBase
 from vllm.engine.protocol import EngineClient
 from vllm.inputs import INPUT_REGISTRY, InputRegistry, PromptType
 from vllm.inputs.preprocess import InputPreprocessor
@@ -22,6 +21,8 @@
 from vllm.v1.engine.detokenizer import Detokenizer
 from vllm.v1.engine.processor import Processor
 from vllm.v1.executor.abstract import Executor
+from vllm.v1.metrics.loggers import LoggingStatLogger, StatLoggerBase
+from vllm.v1.metrics.stats import SchedulerStats
 
 logger = init_logger(__name__)
 
@@ -34,7 +35,6 @@ def __init__(
         executor_class: Type[Executor],
         log_stats: bool,
         usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
-        stat_loggers: Optional[Dict[str, StatLoggerBase]] = None,
         input_registry: InputRegistry = INPUT_REGISTRY,
         use_cached_outputs: bool = False,
         log_requests: bool = True,
@@ -45,7 +45,10 @@ def __init__(
 
         self.log_requests = log_requests
         self.log_stats = log_stats
-        self.stat_loggers = stat_loggers
+        self.stat_loggers: List[StatLoggerBase] = [
+            LoggingStatLogger(),
+            # TODO(rob): PrometheusStatLogger(),
+        ]
         self.model_config = vllm_config.model_config
 
         # Tokenizer (+ ensure liveness if running in another process).
@@ -82,7 +85,6 @@ def __init__(
             asyncio_mode=True,
             vllm_config=vllm_config,
             executor_class=executor_class,
-            log_stats=self.log_stats,
         )
 
         self.output_handler: Optional[asyncio.Task] = None
@@ -94,7 +96,6 @@ def from_engine_args(
         engine_config: Optional[VllmConfig] = None,
         start_engine_loop: bool = True,
         usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
-        stat_loggers: Optional[Dict[str, StatLoggerBase]] = None,
     ) -> "AsyncLLM":
         """Create an AsyncLLM from the EngineArgs."""
 
@@ -114,7 +115,6 @@ def from_engine_args(
             log_stats=not engine_args.disable_log_stats,
             start_engine_loop=start_engine_loop,
             usage_context=usage_context,
-            stat_loggers=stat_loggers,
         )
 
     def shutdown(self):
@@ -254,7 +254,8 @@ async def _run_output_handler(self):
                 outputs = await self.engine_core.get_output_async()
 
                 # 2) Detokenize based on the output.
-                request_outputs, reqs_to_abort = self.detokenizer.step(outputs)
+                request_outputs, reqs_to_abort = self.detokenizer.step(
+                    outputs.outputs)
 
                 # 3) Put the RequestOutputs into the per-request queues.
                 self._process_request_outputs(request_outputs)
@@ -262,6 +263,9 @@ async def _run_output_handler(self):
                 # 4) Abort any requests that finished due to stop strings.
                 await self.engine_core.abort_requests_async(reqs_to_abort)
 
+                # 5) Log any stats.
+                await self._log_stats(scheduler_stats=outputs.scheduler_stats)
+
         except Exception as e:
             logger.exception("EngineCore output handler hit an error: %s", e)
             kill_process_tree(os.getpid())
@@ -278,6 +282,14 @@ async def abort(self, request_id: str) -> None:
         if request_id in self.rid_to_queue:
             del self.rid_to_queue[request_id]
 
+    async def _log_stats(self, scheduler_stats: SchedulerStats):
+        """Forward `scheduler_stats` to every stat logger if logging is on."""
+        if not self.log_stats:
+            return
+
+        for stat_logger in self.stat_loggers:
+            stat_logger.log(scheduler_stats=scheduler_stats)
+
     def encode(
         self,
         prompt: PromptType,
diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py
index 975ce11fe8aff..e7f90d3c62142 100644
--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@@ -17,9 +17,9 @@
     maybe_register_config_serialize_by_value)
 from vllm.utils import get_exception_traceback, zmq_socket_ctx
 from vllm.v1.core.scheduler import Scheduler
-from vllm.v1.engine import (EngineCoreOutput, EngineCoreOutputs,
-                            EngineCoreProfile, EngineCoreRequest,
-                            EngineCoreRequestType, EngineCoreRequestUnion)
+from vllm.v1.engine import (EngineCoreOutputs, EngineCoreProfile,
+                            EngineCoreRequest, EngineCoreRequestType,
+                            EngineCoreRequestUnion)
 from vllm.v1.engine.mm_input_mapper import MMInputMapperServer
 from vllm.v1.executor.abstract import Executor
 from vllm.v1.request import Request, RequestStatus
@@ -28,9 +28,7 @@
 
 logger = init_logger(__name__)
 
-POLLING_TIMEOUT_MS = 5000
-POLLING_TIMEOUT_S = POLLING_TIMEOUT_MS // 1000
-LOGGING_TIME_S = 5
+POLLING_TIMEOUT_S = 2.5
 
 
 class EngineCore:
@@ -40,10 +38,8 @@ def __init__(
         self,
         vllm_config: VllmConfig,
         executor_class: Type[Executor],
-        log_stats: bool = False,
     ):
         assert vllm_config.model_config.runner_type != "pooling"
-        self.log_stats = log_stats
 
         logger.info("Initializing an LLM engine (v%s) with config: %s",
                     VLLM_VERSION, vllm_config)
@@ -62,8 +58,6 @@ def __init__(
                                    vllm_config.cache_config,
                                    vllm_config.lora_config)
 
-        self._last_logging_time = time.time()
-
         self.mm_input_mapper_server = MMInputMapperServer(
             vllm_config.model_config)
 
@@ -114,11 +108,12 @@ def abort_requests(self, request_ids: List[str]):
         self.scheduler.finish_requests(request_ids,
                                        RequestStatus.FINISHED_ABORTED)
 
-    def step(self) -> List[EngineCoreOutput]:
+    def step(self) -> EngineCoreOutputs:
         """Schedule, execute, and make output."""
 
         if not self.scheduler.has_unfinished_requests():
-            return []
+            return EngineCoreOutputs(
+                outputs=[], scheduler_stats=self.scheduler.make_stats())
 
         scheduler_output = self.scheduler.schedule()
         output = self.model_executor.execute_model(scheduler_output)
@@ -145,7 +140,9 @@ def __init__(
         executor_class: Type[Executor],
         log_stats: bool = False,
     ):
-        super().__init__(vllm_config, executor_class, log_stats)
+        super().__init__(vllm_config, executor_class)
+
+        self.log_stats = log_stats
 
         # Background Threads and Queues for IO. These enable us to
         # overlap ZMQ socket IO with GPU since they release the GIL,
@@ -153,7 +150,7 @@ def __init__(
         # model forward pass.
         # Threads handle Socket <-> Queues and core_busy_loop uses Queue.
         self.input_queue: queue.Queue[EngineCoreRequestUnion] = queue.Queue()
-        self.output_queue: queue.Queue[List[EngineCoreOutput]] = queue.Queue()
+        self.output_queue: queue.Queue[EngineCoreOutputs] = queue.Queue()
         threading.Thread(target=self.process_input_socket,
                          args=(input_path, ),
                          daemon=True).start()
@@ -217,8 +214,10 @@ def run_busy_loop(self):
                         self._handle_client_request(req)
                         break
                     except queue.Empty:
-                        self._log_stats()
                         logger.debug("EngineCore busy loop waiting.")
+                        # Break out the loop so we can log_stats in step().
+                        if self.log_stats:
+                            break
                     except BaseException:
                         raise
 
@@ -230,28 +229,9 @@ def run_busy_loop(self):
             # 3) Step the engine core.
             outputs = self.step()
 
-            # 4) Put EngineCoreOutputs into the output queue.
+            # 4) Put EngineCoreOutputs into the output queue.
             self.output_queue.put_nowait(outputs)
 
-            self._log_stats()
-
-    def _log_stats(self):
-        """Log basic stats every LOGGING_TIME_S"""
-
-        if not self.log_stats:
-            return
-
-        now = time.time()
-
-        if now - self._last_logging_time > LOGGING_TIME_S:
-            logger.info(
-                "RUNNING: %s | WAITING: %s",
-                len(self.scheduler.running),
-                len(self.scheduler.waiting),
-            )
-
-            self._last_logging_time = now
-
     def _handle_client_request(self, request: EngineCoreRequestUnion) -> None:
         """Handle EngineCoreRequest or EngineCoreABORT from Client."""
 
@@ -301,7 +281,6 @@ def process_output_socket(self, output_path: str):
 
         with zmq_socket_ctx(output_path, zmq.constants.PUSH) as socket:
             while True:
-                engine_core_outputs = self.output_queue.get()
-                outputs = EngineCoreOutputs(outputs=engine_core_outputs)
+                outputs = self.output_queue.get()
                 encoder.encode_into(outputs, buffer)
                 socket.send_multipart((buffer, ), copy=False)
diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py
index 4ed7f944b058f..9d6ae725e9d2b 100644
--- a/vllm/v1/engine/core_client.py
+++ b/vllm/v1/engine/core_client.py
@@ -12,9 +12,9 @@
 from vllm.logger import init_logger
 from vllm.utils import (get_open_zmq_ipc_path, kill_process_tree,
                         make_zmq_socket)
-from vllm.v1.engine import (EngineCoreOutput, EngineCoreOutputs,
-                            EngineCoreProfile, EngineCoreRequest,
-                            EngineCoreRequestType, EngineCoreRequestUnion)
+from vllm.v1.engine import (EngineCoreOutputs, EngineCoreProfile,
+                            EngineCoreRequest, EngineCoreRequestType,
+                            EngineCoreRequestUnion)
 from vllm.v1.engine.core import EngineCore, EngineCoreProc
 from vllm.v1.executor.abstract import Executor
 from vllm.v1.serial_utils import PickleEncoder
@@ -40,7 +40,6 @@ def make_client(
         asyncio_mode: bool,
         vllm_config: VllmConfig,
         executor_class: Type[Executor],
-        log_stats: bool = False,
     ) -> "EngineCoreClient":
 
         # TODO: support this for debugging purposes.
@@ -50,18 +49,18 @@ def make_client(
                 "is not currently supported.")
 
         if multiprocess_mode and asyncio_mode:
-            return AsyncMPClient(vllm_config, executor_class, log_stats)
+            return AsyncMPClient(vllm_config, executor_class)
 
         if multiprocess_mode and not asyncio_mode:
-            return SyncMPClient(vllm_config, executor_class, log_stats)
+            return SyncMPClient(vllm_config, executor_class)
 
-        return InprocClient(vllm_config, executor_class, log_stats)
+        return InprocClient(vllm_config, executor_class)
 
     @abstractmethod
     def shutdown(self):
         ...
 
-    def get_output(self) -> List[EngineCoreOutput]:
+    def get_output(self) -> EngineCoreOutputs:
         raise NotImplementedError
 
     def add_request(self, request: EngineCoreRequest) -> None:
@@ -73,7 +72,7 @@ def profile(self, is_start: bool = True) -> None:
     def abort_requests(self, request_ids: List[str]) -> None:
         raise NotImplementedError
 
-    async def get_output_async(self) -> List[EngineCoreOutput]:
+    async def get_output_async(self) -> EngineCoreOutputs:
         raise NotImplementedError
 
     async def add_request_async(self, request: EngineCoreRequest) -> None:
@@ -99,7 +98,7 @@ class InprocClient(EngineCoreClient):
     def __init__(self, *args, **kwargs):
         self.engine_core = EngineCore(*args, **kwargs)
 
-    def get_output(self) -> List[EngineCoreOutput]:
+    def get_output(self) -> EngineCoreOutputs:
         return self.engine_core.step()
 
     def add_request(self, request: EngineCoreRequest) -> None:
@@ -133,7 +132,7 @@ def __init__(
         asyncio_mode: bool,
         vllm_config: VllmConfig,
         executor_class: Type[Executor],
-        log_stats: bool = False,
+        log_stats: bool,
     ):
         # The child processes will send SIGUSR1 when unrecoverable
         # errors happen. We kill the process tree here so that the
@@ -194,22 +193,19 @@ def shutdown(self):
 class SyncMPClient(MPClient):
     """Synchronous client for multi-proc EngineCore."""
 
-    def __init__(self,
-                 vllm_config: VllmConfig,
-                 executor_class: Type[Executor],
-                 log_stats: bool = False):
+    def __init__(self, vllm_config: VllmConfig,
+                 executor_class: Type[Executor]):
         super().__init__(
             asyncio_mode=False,
             vllm_config=vllm_config,
             executor_class=executor_class,
-            log_stats=log_stats,
+            log_stats=False,
         )
 
-    def get_output(self) -> List[EngineCoreOutput]:
+    def get_output(self) -> EngineCoreOutputs:
 
         (frame, ) = self.output_socket.recv_multipart(copy=False)
-        engine_core_outputs = self.decoder.decode(frame.buffer).outputs
-        return engine_core_outputs
+        return self.decoder.decode(frame.buffer)
 
     def _send_input(self, request_type: EngineCoreRequestType,
                     request: EngineCoreRequestUnion) -> None:
@@ -235,23 +231,19 @@ def profile(self, is_start: bool = True) -> None:
 class AsyncMPClient(MPClient):
     """Asyncio-compatible client for multi-proc EngineCore."""
 
-    def __init__(self,
-                 vllm_config: VllmConfig,
-                 executor_class: Type[Executor],
-                 log_stats: bool = False):
+    def __init__(self, vllm_config: VllmConfig,
+                 executor_class: Type[Executor]):
         super().__init__(
             asyncio_mode=True,
             vllm_config=vllm_config,
             executor_class=executor_class,
-            log_stats=log_stats,
+            log_stats=True,
         )
 
-    async def get_output_async(self) -> List[EngineCoreOutput]:
+    async def get_output_async(self) -> EngineCoreOutputs:
 
         frames = await self.output_socket.recv_multipart(copy=False)
-        engine_core_outputs = self.decoder.decode(frames[0].buffer).outputs
-
-        return engine_core_outputs
+        return self.decoder.decode(frames[0].buffer)
 
     async def _send_input(self, request_type: EngineCoreRequestType,
                           request: EngineCoreRequestUnion) -> None:
diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py
index 8ced3a34d2da3..ac392f5e4f4cf 100644
--- a/vllm/v1/engine/llm_engine.py
+++ b/vllm/v1/engine/llm_engine.py
@@ -74,7 +74,6 @@ def __init__(
             asyncio_mode=False,
             vllm_config=vllm_config,
             executor_class=executor_class,
-            log_stats=False,
         )
 
     @classmethod
@@ -147,11 +146,11 @@ def add_request(
     def step(self) -> List[RequestOutput]:
 
         # 1) Get EngineCoreOutput from the EngineCore.
-        engine_core_outputs = self.engine_core.get_output()
+        outputs = self.engine_core.get_output()
 
         # 2) Detokenizer the EngineCoreOutput.
         request_outputs, requests_to_abort = self.detokenizer.step(
-            engine_core_outputs)
+            outputs.outputs)
 
         # 3) Abort requests that finished due to stopping criteria.
         if requests_to_abort:
diff --git a/vllm/v1/metrics/__init__.py b/vllm/v1/metrics/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py
new file mode 100644
index 0000000000000..8feeef17542e6
--- /dev/null
+++ b/vllm/v1/metrics/loggers.py
@@ -0,0 +1,38 @@
+import time
+from abc import ABC, abstractmethod
+
+from vllm.logger import init_logger
+from vllm.v1.metrics.stats import SchedulerStats
+
+logger = init_logger(__name__)
+
+_LOCAL_LOGGING_INTERVAL_SEC = 5.0
+
+
+class StatLoggerBase(ABC):
+
+    @abstractmethod
+    def log(self, scheduler_stats: SchedulerStats):
+        ...
+
+
+class LoggingStatLogger(StatLoggerBase):
+    """Stat logger that emits scheduler stats through the module logger."""
+
+    def __init__(self):
+        self.last_log_time = time.monotonic()
+
+    def log(self, scheduler_stats: SchedulerStats):
+        """Emit one stats line, at most once per logging interval."""
+        # Throttle: skip unless _LOCAL_LOGGING_INTERVAL_SEC has elapsed.
+        now = time.monotonic()
+        elapsed = now - self.last_log_time
+        if elapsed < _LOCAL_LOGGING_INTERVAL_SEC:
+            return
+        self.last_log_time = now
+
+        logger.info(
+            "Running: %d reqs, Waiting: %d reqs ",
+            scheduler_stats.num_running_reqs,
+            scheduler_stats.num_waiting_reqs,
+        )
diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py
new file mode 100644
index 0000000000000..5ebb4fd5b37db
--- /dev/null
+++ b/vllm/v1/metrics/stats.py
@@ -0,0 +1,12 @@
+from dataclasses import dataclass
+
+
+@dataclass
+class SchedulerStats:
+    """Stats associated with the scheduler."""
+
+    num_running_reqs: int = 0
+    num_waiting_reqs: int = 0
+
+    # gpu_cache_usage: float = 0.0
+    # gpu_prefix_cache_hit_rate: float = 0.0

From d14e98d924724b284dc5eaf8070d935e214e50c0 Mon Sep 17 00:00:00 2001
From: Isotr0py <mozf@mail2.sysu.edu.cn>
Date: Mon, 13 Jan 2025 08:13:44 +0800
Subject: [PATCH 096/115] [Model] Support GGUF models newly added in
 `transformers` 4.46.0 (#9685)

Signed-off-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
---
 examples/offline_inference/gguf_inference.py  |  22 ++--
 .../models/decoder_only/language/test_gguf.py | 105 ++++++++++++------
 vllm/model_executor/layers/linear.py          |  58 ++++++----
 vllm/model_executor/models/gpt2.py            |  11 +-
 vllm/model_executor/models/llama.py           |   3 +-
 vllm/model_executor/models/stablelm.py        |  31 ++++--
 vllm/model_executor/models/starcoder2.py      |  19 +++-
 7 files changed, 162 insertions(+), 87 deletions(-)

diff --git a/examples/offline_inference/gguf_inference.py b/examples/offline_inference/gguf_inference.py
index 09a5fcc22e553..aa05c4c0bfaa5 100644
--- a/examples/offline_inference/gguf_inference.py
+++ b/examples/offline_inference/gguf_inference.py
@@ -3,27 +3,20 @@
 from vllm import LLM, SamplingParams
 
 
-def run_gguf_inference(model_path):
-    PROMPT_TEMPLATE = "<|system|>\n{system_message}</s>\n<|user|>\n{prompt}</s>\n<|assistant|>\n"  # noqa: E501
-    system_message = "You are a friendly chatbot who always responds in the style of a pirate."  # noqa: E501
+def run_gguf_inference(model_path, tokenizer):
     # Sample prompts.
     prompts = [
         "How many helicopters can a human eat in one sitting?",
         "What's the future of AI?",
     ]
-    prompts = [
-        PROMPT_TEMPLATE.format(system_message=system_message, prompt=prompt)
-        for prompt in prompts
-    ]
+    prompts = [[{"role": "user", "content": prompt}] for prompt in prompts]
     # Create a sampling params object.
     sampling_params = SamplingParams(temperature=0, max_tokens=128)
 
     # Create an LLM.
-    llm = LLM(model=model_path,
-              tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
-              gpu_memory_utilization=0.95)
+    llm = LLM(model=model_path, tokenizer=tokenizer)
 
-    outputs = llm.generate(prompts, sampling_params)
+    outputs = llm.chat(prompts, sampling_params)
     # Print the outputs.
     for output in outputs:
         prompt = output.prompt
@@ -32,7 +25,8 @@ def run_gguf_inference(model_path):
 
 
 if __name__ == "__main__":
-    repo_id = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"
-    filename = "tinyllama-1.1b-chat-v1.0.Q4_0.gguf"
+    repo_id = "bartowski/Phi-3-medium-4k-instruct-GGUF"
+    filename = "Phi-3-medium-4k-instruct-IQ2_M.gguf"
+    tokenizer = "microsoft/Phi-3-medium-4k-instruct"
     model = hf_hub_download(repo_id, filename=filename)
-    run_gguf_inference(model)
+    run_gguf_inference(model, tokenizer)
diff --git a/tests/models/decoder_only/language/test_gguf.py b/tests/models/decoder_only/language/test_gguf.py
index 2b8f5e2faa45e..81b93ebdf0fc0 100644
--- a/tests/models/decoder_only/language/test_gguf.py
+++ b/tests/models/decoder_only/language/test_gguf.py
@@ -4,6 +4,7 @@
 """
 
 import os
+from typing import List, NamedTuple, Type
 
 import pytest
 from huggingface_hub import hf_hub_download
@@ -11,6 +12,7 @@
 
 from tests.quantization.utils import is_quant_method_supported
 
+from ....conftest import VllmRunner
 from ...utils import check_logprobs_close
 
 os.environ["TOKENIZERS_PARALLELISM"] = "true"
@@ -18,31 +20,74 @@
 MAX_MODEL_LEN = 1024
 
 
+class GGUFTestConfig(NamedTuple):
+    original_model: str
+    gguf_repo: str
+    gguf_filename: str
+
+    @property
+    def gguf_model(self):
+        return hf_hub_download(self.gguf_repo, filename=self.gguf_filename)
+
+
+LLAMA_CONFIG = GGUFTestConfig(
+    original_model="meta-llama/Llama-3.2-1B-Instruct",
+    gguf_repo="bartowski/Llama-3.2-1B-Instruct-GGUF",
+    gguf_filename="Llama-3.2-1B-Instruct-IQ4_XS.gguf",
+)
+
+QWEN2_CONFIG = GGUFTestConfig(
+    original_model="Qwen/Qwen2.5-1.5B-Instruct",
+    gguf_repo="Qwen/Qwen2.5-1.5B-Instruct-GGUF",
+    gguf_filename="qwen2.5-1.5b-instruct-q6_k.gguf",
+)
+
+PHI3_CONFIG = GGUFTestConfig(
+    original_model="microsoft/Phi-3.5-mini-instruct",
+    gguf_repo="bartowski/Phi-3.5-mini-instruct-GGUF",
+    gguf_filename="Phi-3.5-mini-instruct-IQ4_XS.gguf",
+)
+
+GPT2_CONFIG = GGUFTestConfig(
+    original_model="openai-community/gpt2-large",
+    gguf_repo="QuantFactory/gpt2-large-GGUF",
+    gguf_filename="gpt2-large.Q4_K_M.gguf",
+)
+
+STABLELM_CONFIG = GGUFTestConfig(
+    original_model="stabilityai/stablelm-3b-4e1t",
+    gguf_repo="afrideva/stablelm-3b-4e1t-GGUF",
+    gguf_filename="stablelm-3b-4e1t.q4_k_m.gguf",
+)
+
+STARCODER_CONFIG = GGUFTestConfig(
+    original_model="bigcode/starcoder2-3b",
+    gguf_repo="QuantFactory/starcoder2-3b-GGUF",
+    gguf_filename="starcoder2-3b.Q6_K.gguf",
+)
+
+MODELS = [
+    LLAMA_CONFIG,
+    QWEN2_CONFIG,
+    PHI3_CONFIG,
+    GPT2_CONFIG,
+    STABLELM_CONFIG,
+    # STARCODER_CONFIG, # broken
+]
+
+
 @pytest.mark.skipif(not is_quant_method_supported("gguf"),
                     reason="gguf is not supported on this GPU type.")
-@pytest.mark.parametrize(("original_model", "gguf_id", "gguf_path"), [
-    ("meta-llama/Llama-3.2-1B-Instruct",
-     "bartowski/Llama-3.2-1B-Instruct-GGUF",
-     "Llama-3.2-1B-Instruct-Q4_K_M.gguf"),
-    ("meta-llama/Llama-3.2-1B-Instruct",
-     "bartowski/Llama-3.2-1B-Instruct-GGUF",
-     "Llama-3.2-1B-Instruct-IQ4_XS.gguf"),
-    ("Qwen/Qwen2-1.5B-Instruct", "Qwen/Qwen2-1.5B-Instruct-GGUF",
-     "qwen2-1_5b-instruct-q4_k_m.gguf"),
-    ("Qwen/Qwen2-1.5B-Instruct", "legraphista/Qwen2-1.5B-Instruct-IMat-GGUF",
-     "Qwen2-1.5B-Instruct.IQ4_XS.gguf"),
-])
+@pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [32])
 @pytest.mark.parametrize("num_logprobs", [5])
 @pytest.mark.parametrize("tp_size", [1, 2])
 def test_models(
-    num_gpus_available,
-    vllm_runner,
-    example_prompts,
-    original_model,
-    gguf_id,
-    gguf_path,
+    num_gpus_available: int,
+    vllm_runner: Type[VllmRunner],
+    example_prompts: List[str],
+    model: GGUFTestConfig,
     dtype: str,
     max_tokens: int,
     num_logprobs: int,
@@ -51,28 +96,26 @@ def test_models(
     if num_gpus_available < tp_size:
         pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")
 
-    gguf_model = hf_hub_download(gguf_id, filename=gguf_path)
-
-    tokenizer = AutoTokenizer.from_pretrained(original_model)
-    messages = [[{
-        'role': 'user',
-        'content': prompt
-    }] for prompt in example_prompts]
-    example_prompts = tokenizer.apply_chat_template(messages,
-                                                    tokenize=False,
-                                                    add_generation_prompt=True)
+    tokenizer = AutoTokenizer.from_pretrained(model.original_model)
+    if tokenizer.chat_template is not None:
+        messages = [[{
+            'role': 'user',
+            'content': prompt
+        }] for prompt in example_prompts]
+        example_prompts = tokenizer.apply_chat_template(
+            messages, tokenize=False, add_generation_prompt=True)
 
     # Run unquantized model.
-    with vllm_runner(model_name=original_model,
+    with vllm_runner(model_name=model.original_model,
                      dtype=dtype,
                      max_model_len=MAX_MODEL_LEN,
                      tensor_parallel_size=tp_size) as original_model:
-
         original_outputs = original_model.generate_greedy_logprobs(
             example_prompts[:-1], max_tokens, num_logprobs)
 
     # Run gguf model.
-    with vllm_runner(model_name=gguf_model,
+    with vllm_runner(model_name=model.gguf_model,
+                     tokenizer_name=model.original_model,
                      dtype=dtype,
                      max_model_len=MAX_MODEL_LEN,
                      tensor_parallel_size=tp_size) as gguf_model:
diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py
index 48cfb1b221720..8876ca72792cf 100644
--- a/vllm/model_executor/layers/linear.py
+++ b/vllm/model_executor/layers/linear.py
@@ -447,8 +447,14 @@ def weight_loader(self,
         is_gguf_weight = getattr(param, "is_gguf_weight", False)
         is_gguf_weight_type = getattr(param, "is_gguf_weight_type", False)
         if is_gguf_weight_type:
-            param.data[loaded_shard_id].copy_(loaded_weight)
-            param.shard_weight_type[loaded_shard_id] = loaded_weight.item()
+            if loaded_shard_id is not None:
+                param.data[loaded_shard_id].copy_(loaded_weight)
+                param.shard_weight_type[loaded_shard_id] = loaded_weight.item()
+            else:
+                param.shard_weight_type = {
+                    i: loaded_weight.item()
+                    for i, _ in enumerate(self.output_sizes)
+                }
             return
 
         if is_gguf_weight:
@@ -459,15 +465,15 @@ def weight_loader(self,
             shard_size = loaded_weight.size(output_dim) // tp_size
             start_idx = tp_rank * shard_size
 
-            loaded_weight = loaded_weight.narrow(output_dim, start_idx,
-                                                 shard_size)
-
-            param.shard_id.append(loaded_shard_id)
-            param.shard_id_map[loaded_shard_id] = len(param.data_container)
-            param.data_container.append(loaded_weight)
-            if len(param.data_container) == 2:
-                self.qweight = param.materialize_nested()
-            return
+            if loaded_shard_id is not None:
+                loaded_weight = loaded_weight.narrow(output_dim, start_idx,
+                                                     shard_size)
+                param.shard_id.append(loaded_shard_id)
+                param.shard_id_map[loaded_shard_id] = len(param.data_container)
+                param.data_container.append(loaded_weight)
+                if len(param.data_container) == 2:
+                    self.qweight = param.materialize_nested()
+                return
 
         param_data = param.data
         output_dim = getattr(param, "output_dim", None)
@@ -811,10 +817,16 @@ def weight_loader(self,
         # initialize GGUF param after we know the quantize type
         is_gguf_weight = getattr(param, "is_gguf_weight", False)
         is_gguf_weight_type = getattr(param, "is_gguf_weight_type", False)
-        if is_gguf_weight_type and loaded_shard_id is not None:
+        if is_gguf_weight_type:
             idx_map = {"q": 0, "k": 1, "v": 2}
-            param.data[idx_map[loaded_shard_id]].copy_(loaded_weight)
-            param.shard_weight_type[loaded_shard_id] = loaded_weight.item()
+            if loaded_shard_id is not None:
+                param.data[idx_map[loaded_shard_id]].copy_(loaded_weight)
+                param.shard_weight_type[loaded_shard_id] = loaded_weight.item()
+            else:
+                param.shard_weight_type = {
+                    k: loaded_weight.item()
+                    for k in idx_map
+                }
             return
 
         if is_gguf_weight:
@@ -825,15 +837,15 @@ def weight_loader(self,
             shard_size = loaded_weight.size(output_dim) // tp_size
             start_idx = tp_rank * shard_size
 
-            loaded_weight = loaded_weight.narrow(output_dim, start_idx,
-                                                 shard_size)
-
-            param.shard_id.append(loaded_shard_id)
-            param.shard_id_map[loaded_shard_id] = len(param.data_container)
-            param.data_container.append(loaded_weight)
-            if len(param.data_container) == 3:
-                self.qweight = param.materialize_nested()
-            return
+            if loaded_shard_id is not None:
+                loaded_weight = loaded_weight.narrow(output_dim, start_idx,
+                                                     shard_size)
+                param.shard_id.append(loaded_shard_id)
+                param.shard_id_map[loaded_shard_id] = len(param.data_container)
+                param.data_container.append(loaded_weight)
+                if len(param.data_container) == 3:
+                    self.qweight = param.materialize_nested()
+                return
 
         param_data = param.data
         output_dim = getattr(param, "output_dim", None)
diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py
index fd926ff0254d4..1656a3cc9e46d 100644
--- a/vllm/model_executor/models/gpt2.py
+++ b/vllm/model_executor/models/gpt2.py
@@ -198,7 +198,10 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         assert not config.scale_attn_by_inverse_layer_idx
         assert not config.reorder_and_upcast_attn
         self.embed_dim = config.hidden_size
-        self.wte = VocabParallelEmbedding(config.vocab_size, self.embed_dim)
+        self.wte = VocabParallelEmbedding(config.vocab_size,
+                                          self.embed_dim,
+                                          quant_config=quant_config,
+                                          prefix=f"{prefix}.wte")
         self.wpe = nn.Embedding(config.max_position_embeddings, self.embed_dim)
         self.start_layer, self.end_layer, self.h = make_layers(
             config.num_hidden_layers,
@@ -259,7 +262,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
             self.lm_head = self.transformer.wte
         else:
             self.lm_head = ParallelLMHead(self.config.vocab_size,
-                                          self.config.hidden_size)
+                                          self.config.hidden_size,
+                                          quant_config=quant_config,
+                                          prefix=f"{prefix}.lm_head")
         self.logits_processor = LogitsProcessor(config.vocab_size)
         self.sampler = get_sampler()
         self.make_empty_intermediate_tensors = (
@@ -304,7 +309,7 @@ def load_weights(self, weights: Iterable[Tuple[str,
         params_dict = dict(self.named_parameters(remove_duplicate=False))
         loaded_params: Set[str] = set()
         for name, loaded_weight in weights:
-            if "lm_head.weight" in name:
+            if name.startswith("lm_head"):
                 # GPT-2 ties the weights of the embedding layer and the final
                 # linear layer.
                 continue
diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py
index 8623da99574bb..17b0fbb777e8e 100644
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -156,7 +156,8 @@ def __init__(
         )
 
         is_neox_style = True
-        if quant_config is not None and quant_config.get_name() == "gguf":
+        is_gguf = quant_config and quant_config.get_name() == "gguf"
+        if is_gguf and config.model_type == "llama":
             is_neox_style = False
 
         self.rotary_emb = get_rope(
diff --git a/vllm/model_executor/models/stablelm.py b/vllm/model_executor/models/stablelm.py
index 6b2107bef0a66..c9d1af78246a6 100644
--- a/vllm/model_executor/models/stablelm.py
+++ b/vllm/model_executor/models/stablelm.py
@@ -22,7 +22,7 @@
 
 import torch
 from torch import nn
-from transformers import PretrainedConfig
+from transformers import StableLmConfig
 
 from vllm.attention import Attention, AttentionMetadata
 from vllm.config import CacheConfig, VllmConfig
@@ -50,8 +50,9 @@
 class StablelmMLP(nn.Module):
 
     def __init__(self,
-                 config: PretrainedConfig,
-                 quant_config: Optional[QuantizationConfig] = None) -> None:
+                 config: StableLmConfig,
+                 quant_config: Optional[QuantizationConfig] = None,
+                 prefix: str = "") -> None:
         super().__init__()
         self.config = config
         self.hidden_size = config.hidden_size
@@ -59,10 +60,13 @@ def __init__(self,
         self.gate_up_proj = MergedColumnParallelLinear(
             config.hidden_size, [config.intermediate_size] * 2,
             bias=False,
-            quant_config=quant_config)
+            quant_config=quant_config,
+            prefix=f"{prefix}.gate_up_proj")
         self.down_proj = RowParallelLinear(config.intermediate_size,
                                            config.hidden_size,
-                                           bias=False)
+                                           bias=False,
+                                           quant_config=quant_config,
+                                           prefix=f"{prefix}.down_proj")
         self.act_fn = SiluAndMul()
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
@@ -75,7 +79,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
 class StablelmAttention(nn.Module):
 
     def __init__(self,
-                 config: PretrainedConfig,
+                 config: StableLmConfig,
                  cache_config: Optional[CacheConfig] = None,
                  quant_config: Optional[QuantizationConfig] = None,
                  prefix: str = "") -> None:
@@ -116,11 +120,13 @@ def __init__(self,
                                           self.total_num_heads,
                                           self.total_num_key_value_heads,
                                           self.qkv_bias,
-                                          quant_config=quant_config)
+                                          quant_config=quant_config,
+                                          prefix=f"{prefix}.qkv_proj")
         self.o_proj = RowParallelLinear(self.total_num_heads * self.head_dim,
                                         self.hidden_size,
                                         bias=False,
-                                        quant_config=quant_config)
+                                        quant_config=quant_config,
+                                        prefix=f"{prefix}.o_proj")
         self.rotary_emb = get_rope(
             self.head_dim,
             rotary_dim=self.rotary_ndims,
@@ -154,7 +160,7 @@ class StablelmDecoderLayer(nn.Module):
 
     def __init__(
         self,
-        config: PretrainedConfig,
+        config: StableLmConfig,
         cache_config: Optional[CacheConfig] = None,
         quant_config: Optional[QuantizationConfig] = None,
         prefix: str = "",
@@ -164,7 +170,7 @@ def __init__(
                                            cache_config,
                                            quant_config,
                                            prefix=f"{prefix}.self_attn")
-        self.mlp = StablelmMLP(config, quant_config)
+        self.mlp = StablelmMLP(config, quant_config, prefix=f"{prefix}.mlp")
         norm_eps = getattr(config, "norm_eps",
                            getattr(config, "layer_norm_eps", 1e-05))
         self.input_layernorm = nn.LayerNorm(config.hidden_size, eps=norm_eps)
@@ -210,6 +216,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         self.embed_tokens = VocabParallelEmbedding(
             config.vocab_size,
             config.hidden_size,
+            quant_config=quant_config,
+            prefix=f"{prefix}.embed_tokens",
         )
         self.start_layer, self.end_layer, self.layers = make_layers(
             config.num_hidden_layers,
@@ -270,7 +278,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
                                         prefix=maybe_prefix(prefix, "model"))
         self.lm_head = ParallelLMHead(config.vocab_size,
                                       config.hidden_size,
-                                      quant_config=quant_config)
+                                      quant_config=quant_config,
+                                      prefix=f"{prefix}.lm_head")
         if self.config.tie_word_embeddings:
             self.lm_head.weight = self.model.embed_tokens.weight
         self.logits_processor = LogitsProcessor(config.vocab_size)
diff --git a/vllm/model_executor/models/starcoder2.py b/vllm/model_executor/models/starcoder2.py
index 22189a517d313..1cd0dedfed2cb 100644
--- a/vllm/model_executor/models/starcoder2.py
+++ b/vllm/model_executor/models/starcoder2.py
@@ -88,12 +88,14 @@ def __init__(self,
             self.total_num_kv_heads,
             bias=self.use_bias,
             quant_config=quant_config,
+            prefix=f"{prefix}.qkv_proj",
         )
         self.o_proj = RowParallelLinear(
             self.total_num_heads * self.head_dim,
             self.hidden_size,
             bias=self.use_bias,
             quant_config=quant_config,
+            prefix=f"{prefix}.o_proj",
         )
         self.rotary_emb = get_rope(
             self.head_dim,
@@ -129,19 +131,22 @@ class Starcoder2MLP(nn.Module):
 
     def __init__(self,
                  config: Starcoder2Config,
-                 quant_config: Optional[QuantizationConfig] = None):
+                 quant_config: Optional[QuantizationConfig] = None,
+                 prefix: str = ""):
         super().__init__()
         self.c_fc = ColumnParallelLinear(
             config.hidden_size,
             config.intermediate_size,
             bias=config.use_bias,
             quant_config=quant_config,
+            prefix=f"{prefix}.c_fc",
         )
         self.c_proj = RowParallelLinear(
             config.intermediate_size,
             config.hidden_size,
             bias=config.use_bias,
             quant_config=quant_config,
+            prefix=f"{prefix}.c_proj",
         )
         self.act = get_act_fn(config.hidden_act)
 
@@ -165,7 +170,9 @@ def __init__(self,
                                              cache_config,
                                              quant_config=quant_config,
                                              prefix=f"{prefix}.self_attn")
-        self.mlp = Starcoder2MLP(config, quant_config=quant_config)
+        self.mlp = Starcoder2MLP(config,
+                                 quant_config=quant_config,
+                                 prefix=f"{prefix}.mlp")
         self.input_layernorm = nn.LayerNorm(config.hidden_size,
                                             eps=config.norm_epsilon)
         self.post_attention_layernorm = nn.LayerNorm(config.hidden_size,
@@ -213,8 +220,11 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         self.vocab_size = config.vocab_size
 
         # TODO: consider padding_idx (currently removed)
-        self.embed_tokens = VocabParallelEmbedding(config.vocab_size,
-                                                   config.hidden_size)
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+            quant_config=quant_config,
+            prefix=f"{prefix}.embed_tokens")
         self.start_layer, self.end_layer, self.layers = make_layers(
             config.num_hidden_layers,
             lambda prefix: Starcoder2DecoderLayer(
@@ -279,6 +289,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
                 org_num_embeddings=config.vocab_size,
                 padding_size=DEFAULT_VOCAB_PADDING_SIZE,
                 quant_config=quant_config,
+                prefix=f"{prefix}.lm_head",
             )
         self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
                                                 config.vocab_size)

From 619ae268c3dc848a9b2f04579ea78ac5655f190f Mon Sep 17 00:00:00 2001
From: Robert Shaw
 <114415538+robertgshaw2-neuralmagic@users.noreply.github.com>
Date: Sun, 12 Jan 2025 23:54:10 -0500
Subject: [PATCH 097/115] [V1] [2/n] Logging and Metrics - `OutputProcessor`
 Abstraction (#11973)

Signed-off-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com>
---
 tests/v1/engine/test_async_llm.py             |  69 +++++-
 ...etokenizer.py => test_output_processor.py} |  99 ++++++++-
 vllm/v1/engine/async_llm.py                   |  89 ++++----
 vllm/v1/engine/core_client.py                 |   6 +-
 vllm/v1/engine/detokenizer.py                 | 137 ++----------
 vllm/v1/engine/llm_engine.py                  |  33 ++-
 vllm/v1/engine/output_processor.py            | 200 ++++++++++++++++++
 vllm/v1/metrics/stats.py                      |  27 +++
 8 files changed, 450 insertions(+), 210 deletions(-)
 rename tests/v1/engine/{test_detokenizer.py => test_output_processor.py} (65%)
 create mode 100644 vllm/v1/engine/output_processor.py

diff --git a/tests/v1/engine/test_async_llm.py b/tests/v1/engine/test_async_llm.py
index fffb5b8100ec7..2c805e18eebae 100644
--- a/tests/v1/engine/test_async_llm.py
+++ b/tests/v1/engine/test_async_llm.py
@@ -1,5 +1,5 @@
 import asyncio
-from typing import Tuple
+from typing import List, Tuple
 
 import pytest
 
@@ -13,6 +13,7 @@
                 allow_module_level=True)
 
 ENGINE_ARGS = AsyncEngineArgs(model="meta-llama/Llama-3.2-1B",
+                              enforce_eager=True,
                               disable_log_requests=True)
 
 
@@ -53,17 +54,63 @@ async def test_load(monkeypatch):
                     generate(engine, request_id, NUM_EXPECTED_TOKENS)))
 
         # Confirm that we got all the EXPECTED tokens from the requests.
-        failed_request_id = None
-        tokens = None
         for task in tasks:
             num_generated_tokens, request_id = await task
-            if (num_generated_tokens != NUM_EXPECTED_TOKENS
-                    and failed_request_id is None):
-                failed_request_id = request_id
-                tokens = num_generated_tokens
-
-        assert failed_request_id is None, (
-            f"{failed_request_id} generated {tokens} but "
-            f"expected {NUM_EXPECTED_TOKENS}")
+            assert num_generated_tokens == NUM_EXPECTED_TOKENS, (
+                f"{request_id} generated {num_generated_tokens} but "
+                f"expected {NUM_EXPECTED_TOKENS}")
+
+        assert not engine.output_processor.has_unfinished_requests()
+        engine.shutdown()
+
+
+@pytest.mark.asyncio
+async def test_abort(monkeypatch):
+
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_USE_V1", "1")
+
+        engine = AsyncLLM.from_engine_args(ENGINE_ARGS)
+
+        NUM_REQUESTS = 100
+        NUM_EXPECTED_TOKENS = 100
+        REQUEST_IDS_TO_ABORT = range(1, 100, 10)
+
+        request_ids = [f"request-{i}" for i in range(NUM_REQUESTS)]
+
+        # Create concurrent requests.
+        tasks: List[asyncio.Task] = []
+        for request_id in request_ids:
+            tasks.append(
+                asyncio.create_task(
+                    generate(engine, request_id, NUM_EXPECTED_TOKENS)))
+
+        # API server cancels requests when they disconnect.
+        for idx in REQUEST_IDS_TO_ABORT:
+            tasks[idx].cancel()
+            await asyncio.sleep(0.1)
+
+        # Confirm the other requests are okay.
+        for idx, task in enumerate(tasks):
+            # Confirm that it was actually canceled.
+            if idx in REQUEST_IDS_TO_ABORT:
+                with pytest.raises(asyncio.CancelledError):
+                    await task
+            else:
+                # Otherwise, make sure the request was not impacted.
+                num_generated_tokens, request_id = await task
+                assert num_generated_tokens == NUM_EXPECTED_TOKENS, (
+                    f"{request_id} generated {num_generated_tokens} but "
+                    f"expected {NUM_EXPECTED_TOKENS}")
+
+        assert not engine.output_processor.has_unfinished_requests()
+
+        # Confirm we can do another generation.
+        request_id = f"request-{REQUEST_IDS_TO_ABORT[0]}"
+        task = asyncio.create_task(
+            generate(engine, request_id, NUM_EXPECTED_TOKENS))
+        num_generated_tokens, request_id = await task
+        assert num_generated_tokens == NUM_EXPECTED_TOKENS
+        assert not engine.output_processor.has_unfinished_requests()
 
         engine.shutdown()
diff --git a/tests/v1/engine/test_detokenizer.py b/tests/v1/engine/test_output_processor.py
similarity index 65%
rename from tests/v1/engine/test_detokenizer.py
rename to tests/v1/engine/test_output_processor.py
index aeae697ca32b0..4735c6f947537 100644
--- a/tests/v1/engine/test_detokenizer.py
+++ b/tests/v1/engine/test_output_processor.py
@@ -3,11 +3,18 @@
 import pytest
 from transformers import AutoTokenizer
 
+from vllm.engine.arg_utils import EngineArgs
 from vllm.sampling_params import RequestOutputKind, SamplingParams
+from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs
 from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest
-from vllm.v1.engine.detokenizer import Detokenizer
+from vllm.v1.engine.output_processor import OutputProcessor
 
 TOKENIZER_NAME = "mistralai/Mistral-7B-Instruct-v0.3"
+VLLM_CONFIG = EngineArgs(model=TOKENIZER_NAME).create_engine_config()
+TOKENIZER_GROUP = init_tokenizer_from_configs(VLLM_CONFIG.model_config,
+                                              VLLM_CONFIG.scheduler_config,
+                                              VLLM_CONFIG.parallel_config,
+                                              VLLM_CONFIG.lora_config)
 tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)
 
 FULL_STRINGS = [
@@ -66,7 +73,7 @@ def get_outputs(self) -> List[EngineCoreOutput]:
     "request_output_kind",
     [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY])
 def test_incremental_detokenization(request_output_kind: RequestOutputKind):
-    detokenizer = Detokenizer(TOKENIZER_NAME)
+    output_processor = OutputProcessor(TOKENIZER_GROUP, log_stats=False)
     engine_core = MockEngineCore(GENERATION_TOKENS)
 
     # Make N requests.
@@ -93,7 +100,7 @@ def test_incremental_detokenization(request_output_kind: RequestOutputKind):
 
     # Add requests to the detokenizer.
     for request in requests:
-        detokenizer.add_request(request)
+        output_processor.add_request(request)
 
     gen_strings = {}
     gen_tokens = {}
@@ -104,7 +111,9 @@ def test_incremental_detokenization(request_output_kind: RequestOutputKind):
             break
 
         # Step the Detokenizer.
-        request_outputs, requests_to_abort = detokenizer.step(outputs)
+        processed_outputs = output_processor.process_outputs(outputs, )
+        request_outputs = processed_outputs.request_outputs
+        requests_to_abort = processed_outputs.reqs_to_abort
         assert len(requests_to_abort) == 0
 
         # Update tracking.
@@ -128,13 +137,13 @@ def test_incremental_detokenization(request_output_kind: RequestOutputKind):
         assert gen_str == ref_gen_str, f"{gen_str=}, {ref_gen_str=}"
         assert gen_toks == ref_gen_toks, f"{gen_toks=}, {ref_gen_toks=}"
 
-    assert detokenizer.get_num_unfinished_requests() == 0
-    assert not detokenizer.has_unfinished_requests()
+    assert output_processor.get_num_unfinished_requests() == 0
+    assert not output_processor.has_unfinished_requests()
 
 
 @pytest.mark.parametrize("include_stop_str_in_output", [True, False])
 def test_stop_string(include_stop_str_in_output: bool):
-    detokenizer = Detokenizer(TOKENIZER_NAME)
+    output_processor = OutputProcessor(TOKENIZER_GROUP, log_stats=False)
     engine_core = MockEngineCore(GENERATION_TOKENS)
 
     # Make N requests.
@@ -162,7 +171,7 @@ def test_stop_string(include_stop_str_in_output: bool):
 
     # Add requests to the detokenizer.
     for request in requests:
-        detokenizer.add_request(request)
+        output_processor.add_request(request)
 
     gen_strings = {}
     aborted = []
@@ -173,7 +182,9 @@ def test_stop_string(include_stop_str_in_output: bool):
             break
 
         # Step the Detokenizer.
-        request_outputs, requests_to_abort = detokenizer.step(outputs)
+        processed_outputs = output_processor.process_outputs(outputs)
+        request_outputs = processed_outputs.request_outputs
+        requests_to_abort = processed_outputs.reqs_to_abort
         for request_output in request_outputs:
             # If aborted, we should not get a request output.
             assert request_output.request_id not in aborted
@@ -214,5 +225,71 @@ def test_stop_string(include_stop_str_in_output: bool):
             assert gen_str == ref_str_exc_stop, (
                 f"{gen_str=}, {ref_str_exc_stop=}")
 
-    assert detokenizer.get_num_unfinished_requests() == 0
-    assert not detokenizer.has_unfinished_requests()
+    assert output_processor.get_num_unfinished_requests() == 0
+    assert not output_processor.has_unfinished_requests()
+
+
+def test_iteration_stats():
+    output_processor = OutputProcessor(TOKENIZER_GROUP, log_stats=True)
+    engine_core = MockEngineCore(GENERATION_TOKENS)
+
+    # Make N requests.
+    requests = [
+        EngineCoreRequest(
+            request_id=f"request-{idx}",
+            prompt=prompt,
+            prompt_token_ids=prompt_tokens,
+            arrival_time=0,
+            mm_inputs=None,
+            mm_hashes=None,
+            mm_placeholders=None,
+            eos_token_id=None,
+            lora_request=None,
+            sampling_params=SamplingParams(),
+        ) for idx, (
+            prompt,
+            prompt_tokens) in enumerate(zip(PROMPT_STRINGS, PROMPT_TOKENS))
+    ]
+
+    # Add all requests except one to the OutputProcessor.
+    num_active = len(GENERATION_TOKENS) - 1
+    for request in requests[:num_active]:
+        output_processor.add_request(request)
+    inactive_request = requests[num_active]
+
+    # First iteration has 2 prefills.
+    outputs = engine_core.get_outputs()[:num_active]
+    processed_outputs = output_processor.process_outputs(outputs)
+    iteration_stats = processed_outputs.iteration_stats
+    total_prompt_tokens = sum(
+        [len(prompt_tokens) for prompt_tokens in PROMPT_TOKENS[:num_active]])
+
+    assert iteration_stats.num_prompt_tokens == total_prompt_tokens
+    assert iteration_stats.num_generation_tokens == num_active
+
+    # Just decodes in this step.
+    outputs = engine_core.get_outputs()[:num_active]
+    processed_outputs = output_processor.process_outputs(outputs)
+    iteration_stats = processed_outputs.iteration_stats
+
+    assert iteration_stats.num_prompt_tokens == 0
+    assert iteration_stats.num_generation_tokens == num_active
+
+    # Add a new request - prefill and 2 decodes in this step.
+    output_processor.add_request(inactive_request)
+    num_active += 1
+    outputs = engine_core.get_outputs()[:num_active]
+    processed_outputs = output_processor.process_outputs(outputs)
+    iteration_stats = processed_outputs.iteration_stats
+    total_prompt_tokens = len(PROMPT_TOKENS[num_active - 1])
+
+    assert iteration_stats.num_prompt_tokens == total_prompt_tokens
+    assert iteration_stats.num_generation_tokens == num_active
+
+    # Just decodes in this step.
+    outputs = engine_core.get_outputs()[:num_active]
+    processed_outputs = output_processor.process_outputs(outputs)
+    iteration_stats = processed_outputs.iteration_stats
+
+    assert iteration_stats.num_prompt_tokens == 0
+    assert iteration_stats.num_generation_tokens == num_active
diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py
index e0ceb59dffcbd..a74699f7513e6 100644
--- a/vllm/v1/engine/async_llm.py
+++ b/vllm/v1/engine/async_llm.py
@@ -1,6 +1,6 @@
 import asyncio
 import os
-from typing import AsyncGenerator, Dict, List, Mapping, Optional, Type, Union
+from typing import AsyncGenerator, List, Mapping, Optional, Type, Union
 
 from vllm.config import ModelConfig, VllmConfig
 from vllm.engine.arg_utils import AsyncEngineArgs
@@ -18,11 +18,11 @@
 from vllm.usage.usage_lib import UsageContext
 from vllm.utils import kill_process_tree
 from vllm.v1.engine.core_client import EngineCoreClient
-from vllm.v1.engine.detokenizer import Detokenizer
+from vllm.v1.engine.output_processor import OutputProcessor
 from vllm.v1.engine.processor import Processor
 from vllm.v1.executor.abstract import Executor
 from vllm.v1.metrics.loggers import LoggingStatLogger, StatLoggerBase
-from vllm.v1.metrics.stats import SchedulerStats
+from vllm.v1.metrics.stats import IterationStats, SchedulerStats
 
 logger = init_logger(__name__)
 
@@ -59,9 +59,6 @@ def __init__(
             lora_config=vllm_config.lora_config)
         self.tokenizer.ping()
 
-        # Request streams (map of request_id -> queue).
-        self.rid_to_queue: Dict[str, asyncio.Queue] = {}
-
         # Processor (converts Inputs --> EngineCoreRequests).
         self.processor = Processor(
             model_config=vllm_config.model_config,
@@ -71,13 +68,9 @@ def __init__(
             input_registry=input_registry,
         )
 
-        # Detokenizer (converts EngineCoreOutputs --> RequestOutput).
-        self.detokenizer = Detokenizer(
-            tokenizer_name=vllm_config.model_config.tokenizer,
-            tokenizer_mode=vllm_config.model_config.tokenizer_mode,
-            trust_remote_code=vllm_config.model_config.trust_remote_code,
-            revision=vllm_config.model_config.tokenizer_revision,
-        )
+        # OutputProcessor (converts EngineCoreOutputs --> RequestOutput).
+        self.output_processor = OutputProcessor(self.tokenizer,
+                                                log_stats=self.log_stats)
 
         # EngineCore (starts the engine in background process).
         self.engine_core = EngineCoreClient.make_client(
@@ -140,9 +133,9 @@ async def add_request(
         """Add new request to the AsyncLLM."""
 
         # 1) Create a new output queue for the request.
-        if request_id in self.rid_to_queue:
+        if self.output_processor.is_request_active(request_id):
             raise ValueError(f"Request id {request_id} already running.")
-        self.rid_to_queue[request_id] = asyncio.Queue()
+        queue: asyncio.Queue[RequestOutput] = asyncio.Queue()
 
         # 2) Convert Input --> Request.
         request = self.processor.process_inputs(request_id, prompt, params,
@@ -151,8 +144,8 @@ async def add_request(
                                                 prompt_adapter_request,
                                                 priority)
 
-        # 3) Add the request to Detokenizer (this process).
-        self.detokenizer.add_request(request)
+        # 3) Add the request to OutputProcessor (this process).
+        self.output_processor.add_request(request, queue)
 
         # 4) Add the EngineCoreRequest to EngineCore (separate process).
         await self.engine_core.add_request_async(request)
@@ -160,7 +153,7 @@ async def add_request(
         if self.log_requests:
             logger.info("Added request %s.", request_id)
 
-        return self.rid_to_queue[request_id]
+        return queue
 
     # TODO: we should support multiple prompts in one call, as you
     # can do with LLM.generate. So that for multi-prompt completion
@@ -217,10 +210,9 @@ async def generate(
                 # task switching under load which helps performance).
                 out = q.get_nowait() if q.qsize() > 0 else await q.get()
 
-                # Note: both Detokenizer and EngineCore handle their
+                # Note: both OutputProcessor and EngineCore handle their
                 # own request cleanup based on finished.
                 if out.finished:
-                    del self.rid_to_queue[request_id]
                     yield out
                     break
 
@@ -233,57 +225,51 @@ async def generate(
             await self.abort(request_id)
             raise
 
-    def _process_request_outputs(self, request_outputs: List[RequestOutput]):
-        """Process outputs by putting them into per-request queues."""
-
-        for request_output in request_outputs:
-            request_id = request_output.request_id
-
-            # Note: it is possible a request was aborted and removed from
-            # the state due to client cancellations, so if we encounter a
-            # request id not in the state, we skip.
-            if request_id in self.rid_to_queue:
-                self.rid_to_queue[request_id].put_nowait(request_output)
-
     async def _run_output_handler(self):
         """Background loop: pulls from EngineCore and pushes to AsyncStreams."""
 
         try:
             while True:
-                # 1) Pull EngineCoreOutput from the EngineCore.
+                # 1) Pull EngineCoreOutputs from the EngineCore.
                 outputs = await self.engine_core.get_output_async()
 
-                # 2) Detokenize based on the output.
-                request_outputs, reqs_to_abort = self.detokenizer.step(
+                # 2) Process EngineCoreOutputs.
+                processed_outputs = self.output_processor.process_outputs(
                     outputs.outputs)
+                # NOTE: RequestOutputs are pushed to their queues.
+                assert len(processed_outputs.request_outputs) == 0
 
-                # 3) Put the RequestOutputs into the per-request queues.
-                self._process_request_outputs(request_outputs)
+                # 3) Abort any reqs that finished due to stop strings.
+                await self.engine_core.abort_requests_async(
+                    processed_outputs.reqs_to_abort)
 
-                # 4) Abort any requests that finished due to stop strings.
-                await self.engine_core.abort_requests_async(reqs_to_abort)
-
-                # 5) Log any stats.
-                await self._log_stats(scheduler_stats=outputs.scheduler_stats)
+                # 4) Logging.
+                # TODO(rob): make into a coroutine and launch it in
+                # background thread once we add Prometheus.
+                self._log_stats(
+                    scheduler_stats=outputs.scheduler_stats,
+                    iteration_stats=processed_outputs.iteration_stats,
+                )
 
         except Exception as e:
             logger.exception("EngineCore output handler hit an error: %s", e)
             kill_process_tree(os.getpid())
 
     async def abort(self, request_id: str) -> None:
-        """Abort RequestId in self, detokenizer, and engine core."""
+        """Abort RequestId in OutputProcessor and EngineCore."""
 
         request_ids = [request_id]
         await self.engine_core.abort_requests_async(request_ids)
-        self.detokenizer.abort_requests(request_ids)
+        self.output_processor.abort_requests(request_ids)
 
-        # If a request finishes while we await then the request_id
-        # will be removed from the tracked queues before we get here.
-        if request_id in self.rid_to_queue:
-            del self.rid_to_queue[request_id]
+        if self.log_requests:
+            logger.info("Aborted request %s.", request_id)
 
-    async def _log_stats(self, scheduler_stats: SchedulerStats):
-        """Log stats to the stat loggers."""
+    def _log_stats(
+        self,
+        scheduler_stats: SchedulerStats,
+        iteration_stats: IterationStats,
+    ):
         if not self.log_stats:
             return
 
@@ -314,8 +300,7 @@ async def get_tokenizer(
         self,
         lora_request: Optional[LoRARequest] = None,
     ) -> AnyTokenizer:
-        assert lora_request is None
-        return self.detokenizer.tokenizer
+        return self.tokenizer.get_lora_tokenizer(lora_request)
 
     async def is_tracing_enabled(self) -> bool:
         return False
diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py
index 9d6ae725e9d2b..ac0f0f14bf1ab 100644
--- a/vllm/v1/engine/core_client.py
+++ b/vllm/v1/engine/core_client.py
@@ -105,7 +105,8 @@ def add_request(self, request: EngineCoreRequest) -> None:
         self.engine_core.add_request(request)
 
     def abort_requests(self, request_ids: List[str]) -> None:
-        self.engine_core.abort_requests(request_ids)
+        if len(request_ids) > 0:
+            self.engine_core.abort_requests(request_ids)
 
     def shutdown(self):
         self.engine_core.shutdown()
@@ -221,7 +222,8 @@ def add_request(self, request: EngineCoreRequest) -> None:
         self._send_input(EngineCoreRequestType.ADD, request)
 
     def abort_requests(self, request_ids: List[str]) -> None:
-        self._send_input(EngineCoreRequestType.ABORT, request_ids)
+        if len(request_ids) > 0:
+            self._send_input(EngineCoreRequestType.ABORT, request_ids)
 
     def profile(self, is_start: bool = True) -> None:
         self._send_input(EngineCoreRequestType.PROFILE,
diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py
index 65be9e58e03c8..4a8b61beec037 100644
--- a/vllm/v1/engine/detokenizer.py
+++ b/vllm/v1/engine/detokenizer.py
@@ -1,18 +1,25 @@
 from dataclasses import dataclass
-from typing import Dict, Iterable, List, Optional, Tuple, Union
+from typing import List, Optional, Union
 
 from vllm.engine.output_processor.stop_checker import StopChecker
 from vllm.logger import init_logger
-from vllm.outputs import RequestOutput
 from vllm.sampling_params import RequestOutputKind
 from vllm.transformers_utils.detokenizer_utils import (
     AnyTokenizer, convert_prompt_ids_to_tokens, detokenize_incrementally)
-from vllm.transformers_utils.tokenizer import get_tokenizer
 from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest
 
 logger = init_logger(__name__)
 
 
+@dataclass
+class DetokenizerOutput:
+    output_text: str
+    token_ids: List[int]
+    finished: bool
+    finish_reason: Optional[str] = None
+    stop_reason: Union[int, str, None] = None
+
+
 @dataclass
 class IncrementalDetokenizer:
 
@@ -20,6 +27,7 @@ class IncrementalDetokenizer:
     output_text: str
     tokens: List[str]
     token_ids: List[int]
+    prompt_len: int
 
     # Stop strings
     stop: List[str]
@@ -34,11 +42,6 @@ class IncrementalDetokenizer:
     spaces_between_special_tokens: bool
     output_kind: RequestOutputKind
 
-    # TODO: Probably decouple these
-    request_id: str
-    prompt: Optional[str]
-    prompt_token_ids: List[int]
-
     # Tokenizer for this request
     tokenizer: AnyTokenizer
 
@@ -48,8 +51,7 @@ class IncrementalDetokenizer:
 
     @property
     def output_token_ids(self) -> List[int]:
-        assert len(self.token_ids) >= len(self.prompt_token_ids)
-        return self.token_ids[len(self.prompt_token_ids):]
+        return self.token_ids[self.prompt_len:]
 
     @classmethod
     def from_new_request(
@@ -87,25 +89,25 @@ def from_new_request(
             spaces_between_special_tokens=request.sampling_params.
             spaces_between_special_tokens,
             output_kind=request.sampling_params.output_kind,
-            request_id=request.request_id,
-            prompt=request.prompt,
-            prompt_token_ids=request.prompt_token_ids,
+            prompt_len=len(request.prompt_token_ids),
             tokenizer=tokenizer,
             stop_buffer_length=stop_buffer_length,
         )
 
-    def add_tokens(
+    def update_from_output(
         self,
-        new_token_ids: List[int],
-        finish_reason: Optional[str],
-        stop_reason: Optional[Union[int, str, None]],
-    ) -> Optional[RequestOutput]:
+        output: EngineCoreOutput,
+    ) -> Optional[DetokenizerOutput]:
         """
         Update RequestState for the request_id by:
             1) Detokenize the new token ids incrementally.
             2) Update the RequestOutput with the new text.
         """
 
+        new_token_ids = output.new_token_ids
+        finish_reason = output.finish_reason
+        stop_reason = output.stop_reason
+
         # 1) Detokenize the new token ids incrementally.
         # TODO(woosuk): This method becomes very inefficient when the number of
         # new_token_ids is more than 1. We need to optimize this.
@@ -158,21 +160,8 @@ def add_tokens(
         output_text = self._get_next_output_text(finished, delta)
         token_ids = new_token_ids if delta else self.output_token_ids
 
-        request_output = RequestOutput.new(
-            self.request_id,
-            self.prompt,
-            self.prompt_token_ids,
-            output_text,
-            token_ids,
-            finished,
-        )
-
-        if finished:
-            completion_output = request_output.outputs[0]
-            completion_output.finish_reason = finish_reason
-            completion_output.stop_reason = stop_reason
-
-        return request_output
+        return DetokenizerOutput(output_text, token_ids, finished,
+                                 finish_reason, stop_reason)
 
     def _get_next_output_text(self, finished: bool, delta: bool) -> str:
         """If delta is True, only new text since the last call to
@@ -189,85 +178,3 @@ def _get_next_output_text(self, finished: bool, delta: bool) -> str:
             self._last_output_text_offset = length
             return self.output_text[last_offset:length]
         return ""
-
-
-class Detokenizer:
-
-    def __init__(self,
-                 tokenizer_name: str,
-                 tokenizer_mode: str = "auto",
-                 trust_remote_code: bool = False,
-                 revision: Optional[str] = None):
-        # TODO: once we support LoRA, we should should pass the tokenizer
-        # here. We currently have two copies (this + in the LLMEngine).
-        self.tokenizer = get_tokenizer(tokenizer_name=tokenizer_name,
-                                       tokenizer_mode=tokenizer_mode,
-                                       trust_remote_code=trust_remote_code,
-                                       revision=revision)
-
-        # Request id -> IncrementalDetokenizer
-        self.request_states: Dict[str, IncrementalDetokenizer] = {}
-
-    def is_request_active(self, request_id: str):
-        return request_id in self.request_states
-
-    def get_num_unfinished_requests(self):
-        return len(self.request_states)
-
-    def has_unfinished_requests(self) -> bool:
-        return len(self.request_states) > 0
-
-    def abort_requests(
-        self,
-        request_ids: Iterable[str],
-    ) -> None:
-        """Remove the request_ids from the Detokenizer."""
-
-        for request_id in request_ids:
-            self.request_states.pop(request_id, None)
-
-    def add_request(
-        self,
-        request: EngineCoreRequest,
-    ):
-        """Add new request to the Detokenizer."""
-
-        assert (request.request_id not in self.request_states)
-
-        request_state = IncrementalDetokenizer.from_new_request(
-            self.tokenizer, request)
-        self.request_states[request.request_id] = request_state
-
-    def step(
-        self, encore_core_outputs: List[EngineCoreOutput]
-    ) -> Tuple[List[RequestOutput], List[str]]:
-        """Update state and request the RequestOutputs to the LLMEngine."""
-
-        request_outputs: List[RequestOutput] = []
-        requests_to_abort: List[str] = []
-        for engine_core_output in encore_core_outputs:
-            request_id = engine_core_output.request_id
-            detokenizer = self.request_states.get(request_id)
-            if detokenizer is None:
-                # Ignore output for already-aborted request.
-                continue
-
-            # Detokenize and update state.
-            request_output = detokenizer.add_tokens(
-                new_token_ids=engine_core_output.new_token_ids,
-                finish_reason=engine_core_output.finish_reason,
-                stop_reason=engine_core_output.stop_reason,
-            )
-
-            if request_output is not None:
-                # Add to RequestOutputs list.
-                request_outputs.append(request_output)
-
-                # Free completed requests.
-                if request_output.finished:
-                    self.request_states.pop(request_id)
-                    if not engine_core_output.finished:
-                        requests_to_abort.append(request_id)
-
-        # Return to EngineClient.
-        return request_outputs, requests_to_abort
diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py
index ac392f5e4f4cf..f5999ccda6447 100644
--- a/vllm/v1/engine/llm_engine.py
+++ b/vllm/v1/engine/llm_engine.py
@@ -18,7 +18,7 @@
     BaseTokenizerGroup, init_tokenizer_from_configs)
 from vllm.usage.usage_lib import UsageContext
 from vllm.v1.engine.core_client import EngineCoreClient
-from vllm.v1.engine.detokenizer import Detokenizer
+from vllm.v1.engine.output_processor import OutputProcessor
 from vllm.v1.engine.processor import Processor
 from vllm.v1.executor.abstract import Executor
 
@@ -60,13 +60,9 @@ def __init__(
                                    input_registry=input_registry,
                                    mm_registry=mm_registry)
 
-        # Detokenizer (converts EngineCoreOutputs --> RequestOutput)
-        self.detokenizer = Detokenizer(
-            tokenizer_name=vllm_config.model_config.tokenizer,
-            tokenizer_mode=vllm_config.model_config.tokenizer_mode,
-            trust_remote_code=vllm_config.model_config.trust_remote_code,
-            revision=vllm_config.model_config.tokenizer_revision,
-        )
+        # OutputProcessor (converts EngineCoreOutputs --> RequestOutput).
+        self.output_processor = OutputProcessor(self.tokenizer,
+                                                log_stats=False)
 
         # EngineCore (gets EngineCoreRequests and gives EngineCoreOutputs)
         self.engine_core = EngineCoreClient.make_client(
@@ -103,10 +99,10 @@ def from_engine_args(
                    multiprocess_mode=enable_multiprocessing)
 
     def get_num_unfinished_requests(self) -> int:
-        return self.detokenizer.get_num_unfinished_requests()
+        return self.output_processor.get_num_unfinished_requests()
 
     def has_unfinished_requests(self) -> bool:
-        return self.detokenizer.has_unfinished_requests()
+        return self.output_processor.has_unfinished_requests()
 
     @classmethod
     def validate_outputs(cls, outputs, output_type):
@@ -116,7 +112,7 @@ def abort_request(self, request_ids: List[str]) -> None:
         """Remove request_ids from EngineCore and Detokenizer."""
 
         self.engine_core.abort_requests(request_ids)
-        self.detokenizer.abort_requests(request_ids)
+        self.output_processor.abort_requests(request_ids)
 
     def add_request(
         self,
@@ -137,8 +133,8 @@ def add_request(
                                                 prompt_adapter_request,
                                                 priority)
 
-        # 2) Add the request to Detokenizer.
-        self.detokenizer.add_request(request)
+        # 2) Add the request to OutputProcessor (no per-request queue).
+        self.output_processor.add_request(request)
 
         # 3) Add the request to EngineCore.
         self.engine_core.add_request(request)
@@ -148,15 +144,14 @@ def step(self) -> List[RequestOutput]:
         # 1) Get EngineCoreOutput from the EngineCore.
         outputs = self.engine_core.get_output()
 
-        # 2) Detokenizer the EngineCoreOutput.
-        request_outputs, requests_to_abort = self.detokenizer.step(
+        # 2) Process EngineCoreOutputs.
+        processed_outputs = self.output_processor.process_outputs(
             outputs.outputs)
 
-        # 3) Abort requests that finished due to stopping criteria.
-        if requests_to_abort:
-            self.abort_request(requests_to_abort)
+        # 3) Abort any reqs that finished due to stop strings.
+        self.engine_core.abort_requests(processed_outputs.reqs_to_abort)
 
-        return request_outputs
+        return processed_outputs.request_outputs
 
     def get_model_config(self):
         return self.model_config
diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py
new file mode 100644
index 0000000000000..749f4f5043c97
--- /dev/null
+++ b/vllm/v1/engine/output_processor.py
@@ -0,0 +1,200 @@
+import asyncio
+from dataclasses import dataclass
+from typing import Dict, List, Optional
+
+from vllm.outputs import RequestOutput
+from vllm.transformers_utils.detokenizer_utils import AnyTokenizer
+from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup
+from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest
+from vllm.v1.engine.detokenizer import (DetokenizerOutput,
+                                        IncrementalDetokenizer)
+from vllm.v1.metrics.stats import IterationStats
+
+
+@dataclass
+class OutputProcessorOutput:
+
+    request_outputs: List[RequestOutput]
+    reqs_to_abort: List[str]
+    iteration_stats: IterationStats
+
+
+class RequestState:
+
+    def __init__(
+        self,
+        request_id: str,
+        prompt: Optional[str],
+        prompt_token_ids: List[int],
+        detokenizer: IncrementalDetokenizer,
+        queue: Optional[asyncio.Queue[RequestOutput]],
+    ):
+        self.request_id = request_id
+        self.prompt = prompt
+        self.prompt_token_ids = prompt_token_ids
+        self.prompt_len = len(prompt_token_ids)
+        self.detokenizer = detokenizer
+        self.is_prefilling = True
+        self.queue = queue
+
+    @classmethod
+    def from_new_request(
+        cls,
+        tokenizer: AnyTokenizer,
+        request: EngineCoreRequest,
+        queue: Optional[asyncio.Queue[RequestOutput]] = None,
+    ) -> "RequestState":
+        return cls(
+            request_id=request.request_id,
+            prompt=request.prompt,
+            prompt_token_ids=request.prompt_token_ids,
+            detokenizer=IncrementalDetokenizer.from_new_request(
+                tokenizer=tokenizer,
+                request=request,
+            ),
+            queue=queue,
+        )
+
+
+class OutputProcessor:
+    """Process EngineCoreOutputs into RequestOutputs."""
+
+    def __init__(
+        self,
+        tokenizer: BaseTokenizerGroup,
+        log_stats: bool,
+    ):
+        self.log_stats = log_stats
+        self.tokenizer = tokenizer
+        self.request_states: Dict[str, RequestState] = {}
+
+    def is_request_active(self, request_id: str) -> bool:
+        return request_id in self.request_states
+
+    def get_num_unfinished_requests(self):
+        return len(self.request_states)
+
+    def has_unfinished_requests(self) -> bool:
+        return len(self.request_states) > 0
+
+    def abort_requests(
+        self,
+        request_ids: List[str],
+    ) -> None:
+        for request_id in request_ids:
+            self.request_states.pop(request_id, None)
+
+    def add_request(
+        self,
+        request: EngineCoreRequest,
+        queue: Optional[asyncio.Queue[RequestOutput]] = None,
+    ) -> None:
+        request_id = request.request_id
+        if request_id in self.request_states:
+            raise ValueError(f"Request id {request_id} already running.")
+
+        self.request_states[request_id] = RequestState.from_new_request(
+            tokenizer=self.tokenizer.get_lora_tokenizer(request.lora_request),
+            request=request,
+            queue=queue)
+
+    def process_outputs(
+        self,
+        engine_core_outputs: List[EngineCoreOutput],
+    ) -> OutputProcessorOutput:
+        """
+        Process the EngineCoreOutputs:
+        1) Compute stats for logging
+        2) Detokenize
+        3) Create and handle RequestOutput objects:
+            * If there is a queue (for usage with AsyncLLM), 
+              put the RequestOutput objects into the queue for
+              handling by the per-request generate() tasks.
+
+            * If there is no queue (for usage with LLMEngine), 
+              return a list of RequestOutput objects.
+
+        ****************** NOTE FOR DEVELOPERS ******************
+
+        VLLM V1 minimizes the number of python loops over the full
+        batch to ensure system overheads are minimized. This is the 
+        only function that should loop over EngineCoreOutputs.
+
+        If you need to touch every element of the batch, implement a
+        method called XXXClass.update_from_output() to be called
+        within the loop below. For examples, see:
+            * IterationStats.update_from_output()
+            * IncrementalDetokenizer.update_from_output()
+        
+        TODO(rob): add a Protocol that makes update_from_output explicit.
+        
+        **********************************************************
+        """
+
+        request_outputs: List[RequestOutput] = []
+        reqs_to_abort: List[str] = []
+        iteration_stats = IterationStats(self.log_stats)
+        for engine_core_output in engine_core_outputs:
+            req_id = engine_core_output.request_id
+            req_state = self.request_states.get(req_id)
+            if req_state is None:
+                # Ignore output for already-aborted request.
+                continue
+
+            # 1) Compute stats for this iteration.
+            iteration_stats.update_from_output(engine_core_output,
+                                               req_state.is_prefilling,
+                                               req_state.prompt_len)
+            req_state.is_prefilling = False
+
+            # 2) Detokenize the token ids into text.
+            detokenizer_output = req_state.detokenizer.update_from_output(
+                engine_core_output)
+
+            # 3) Create and handle RequestOutput objects.
+            if request_output := self._make_request_output(
+                    req_state, detokenizer_output):
+                if req_state.queue is not None:
+                    # AsyncLLM: put into queue for handling by generate().
+                    req_state.queue.put_nowait(request_output)
+                else:
+                    # LLMEngine: return list of RequestOutputs.
+                    request_outputs.append(request_output)
+
+                # Free completed requests.
+                if request_output.finished:
+                    self.request_states.pop(req_id)
+                    if not engine_core_output.finished:
+                        # The detokenizer detected a stop string before
+                        # EngineCore finished the req, so abort it there.
+                        reqs_to_abort.append(req_id)
+
+        return OutputProcessorOutput(
+            request_outputs=request_outputs,
+            reqs_to_abort=reqs_to_abort,
+            iteration_stats=iteration_stats,
+        )
+
+    def _make_request_output(
+        self,
+        request_state: RequestState,
+        detokenizer_output: Optional[DetokenizerOutput],
+    ) -> Optional[RequestOutput]:
+
+        if detokenizer_output is None:
+            return None
+
+        request_output = RequestOutput.new(
+            request_state.request_id,
+            request_state.prompt,
+            request_state.prompt_token_ids,
+            detokenizer_output.output_text,
+            detokenizer_output.token_ids,
+            detokenizer_output.finished,
+        )
+        if detokenizer_output.finished:
+            completion_output = request_output.outputs[0]
+            completion_output.finish_reason = detokenizer_output.finish_reason
+            completion_output.stop_reason = detokenizer_output.stop_reason
+
+        return request_output
diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py
index 5ebb4fd5b37db..60cb986f8bbce 100644
--- a/vllm/v1/metrics/stats.py
+++ b/vllm/v1/metrics/stats.py
@@ -1,4 +1,8 @@
 from dataclasses import dataclass
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from vllm.v1.engine import EngineCoreOutput
 
 
 @dataclass
@@ -10,3 +14,26 @@ class SchedulerStats:
 
     # gpu_cache_usage: float = 0.0
     # gpu_prefix_cache_hit_rate: float = 0.0
+
+
+class IterationStats:
+    """Stats associated with a single set of EngineCoreOutputs."""
+
+    def __init__(self, log_stats: bool):
+        self.log_stats = log_stats
+        self.num_generation_tokens = 0
+        self.num_prompt_tokens = 0
+
+    def update_from_output(self, output: "EngineCoreOutput",
+                           is_prefilling: bool, prompt_len: int):
+        if not self.log_stats:
+            return
+
+        self.num_generation_tokens += len(output.new_token_ids)
+        if is_prefilling:
+            # This relies on the invariant that EngineCore does
+            # not stream outputs for partially completed prefills
+            # (scheduler.update_from_output makes EngineCoreOutput
+            # iff num_computed_tokens == num_tokens).
+            assert (len(output.new_token_ids) > 0)
+            self.num_prompt_tokens += prompt_len

From f7b3ba82c3eec71f31f8d49f708ab328b5e908f6 Mon Sep 17 00:00:00 2001
From: Yangcheng Li <liyangcheng.lyc@alibaba-inc.com>
Date: Mon, 13 Jan 2025 13:07:48 +0800
Subject: [PATCH 098/115] [MISC] fix typo in kv transfer send recv test
 (#11983)

---
 tests/kv_transfer/test_send_recv.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/kv_transfer/test_send_recv.py b/tests/kv_transfer/test_send_recv.py
index 4beba4dc05dde..1cc1ced9968d7 100644
--- a/tests/kv_transfer/test_send_recv.py
+++ b/tests/kv_transfer/test_send_recv.py
@@ -22,13 +22,13 @@ def test_run(my_rank, pipe):
         x2 = pipe.recv_tensor()
         print(f"rank {my_rank} received x2 = ", x2)
         y2 = pipe.recv_tensor()
-        print(f"rank {my_rank} received y2 = ", x2)
+        print(f"rank {my_rank} received y2 = ", y2)
 
     else:
         x2 = pipe.recv_tensor()
         print(f"rank {my_rank} received x2 = ", x2)
         y2 = pipe.recv_tensor()
-        print(f"rank {my_rank} received y2 = ", x2)
+        print(f"rank {my_rank} received y2 = ", y2)
         pipe.send_tensor(x)
         print(f"rank {my_rank} sent tensor x")
         pipe.send_tensor(y)

From 9dd02d85ca801c99241317a8061bd025c726af93 Mon Sep 17 00:00:00 2001
From: Siyuan Li <94890248+liaoyanqing666@users.noreply.github.com>
Date: Mon, 13 Jan 2025 14:24:10 +0800
Subject: [PATCH 099/115] [Bug] Fix usage of `.transpose()` and `.view()`
 consecutively. (#11979)

---
 vllm/attention/layer.py                  | 2 +-
 vllm/model_executor/models/intern_vit.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py
index 55e4e14027f79..b8afd428f2cc0 100644
--- a/vllm/attention/layer.py
+++ b/vllm/attention/layer.py
@@ -230,7 +230,7 @@ def forward(
                                                  value,
                                                  scale=self.scale)
             out = out.transpose(1, 2)
-        return out.view(bsz, q_len, -1)
+        return out.reshape(bsz, q_len, -1)
 
 
 def unified_attention(
diff --git a/vllm/model_executor/models/intern_vit.py b/vllm/model_executor/models/intern_vit.py
index 7ff68bd60e8ad..8ad009d5101e4 100644
--- a/vllm/model_executor/models/intern_vit.py
+++ b/vllm/model_executor/models/intern_vit.py
@@ -271,7 +271,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         v = v.transpose(1, 2)
 
         x = F.scaled_dot_product_attention(q, k, v, scale=self.scale)
-        x = x.transpose(1, 2).view(B, N, -1)
+        x = x.transpose(1, 2).reshape(B, N, -1)
 
         x = self.proj(x)
         return x

From 80ea3af1a06ff445e6cdede072bda1429c9dac06 Mon Sep 17 00:00:00 2001
From: Sungjae Lee <33976427+llsj14@users.noreply.github.com>
Date: Mon, 13 Jan 2025 15:50:35 +0900
Subject: [PATCH 100/115] [CI][Spec Decode] fix: broken test for EAGLE model
 (#11972)

Signed-off-by: Sungjae Lee <33976427+llsj14@users.noreply.github.com>
---
 .buildkite/test-pipeline.yaml       |  4 +++-
 vllm/model_executor/models/eagle.py | 11 ++++++++++-
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 393912881bca3..74b287c7adbfa 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -231,13 +231,15 @@ steps:
     - pytest -v -s test_logits_processor.py
     - pytest -v -s model_executor/test_guided_processors.py
 
-- label: Speculative decoding tests # 30min
+- label: Speculative decoding tests # 40min
   source_file_dependencies:
   - vllm/spec_decode
   - tests/spec_decode
+  - vllm/model_executor/models/eagle.py
   commands:
     - pytest -v -s spec_decode/e2e/test_multistep_correctness.py
     - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py
+    - pytest -v -s spec_decode/e2e/test_eagle_correctness.py
 
 - label: LoRA Test %N # 15min each
   mirror_hardwares: [amd]
diff --git a/vllm/model_executor/models/eagle.py b/vllm/model_executor/models/eagle.py
index eb7b5af19ae96..948560b4906b8 100644
--- a/vllm/model_executor/models/eagle.py
+++ b/vllm/model_executor/models/eagle.py
@@ -19,6 +19,11 @@
 
 class DummyInputLayerNorm(nn.Module):
 
+    def __init__(self, weight=None, bias=None):
+        super().__init__()
+        self.weight = nn.Parameter(weight) if weight is not None else None
+        self.bias = nn.Parameter(bias) if bias is not None else None
+
     def forward(self, x):
         return x
 
@@ -69,7 +74,11 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
 
         # Modify layer normalization and residual connections as suggested
         # in the EAGLE framework: https://github.com/SafeAILab/EAGLE
-        self.model.model.layers[0].input_layernorm = DummyInputLayerNorm()
+        # While weights and biases are generally not needed,
+        # they are retained here to support certain unit tests
+        # (e.g., spec_decode/e2e/test_eagle_correctness.py).
+        self.model.model.layers[0].input_layernorm = DummyInputLayerNorm(
+            weight=self.model.model.layers[0].input_layernorm.weight)
         self.model.model.norm = DummyOutputNorm()
 
         self.orig_vocab_size = config.vocab_size

From cf6bbcb49324c24fc0f6f9381400c299c9c2d7ac Mon Sep 17 00:00:00 2001
From: Concurrensee <yida.wu@amd.com>
Date: Mon, 13 Jan 2025 01:05:06 -0600
Subject: [PATCH 101/115] [Misc] Fix Deepseek V2 fp8 kv-scale remapping
 (#11947)

Signed-off-by: Yida Wu <yidawu@alumni.cmu.edu>
---
 vllm/model_executor/models/deepseek_v2.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py
index 9132040545863..d83cafaf998ab 100644
--- a/vllm/model_executor/models/deepseek_v2.py
+++ b/vllm/model_executor/models/deepseek_v2.py
@@ -45,7 +45,8 @@
 from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
-from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+from vllm.model_executor.model_loader.weight_utils import (
+    default_weight_loader, maybe_remap_kv_scale_name)
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors
 
@@ -635,6 +636,11 @@ def load_weights(self, weights: Iterable[Tuple[str,
                     if name.endswith(".bias") and name not in params_dict:
                         continue
 
+                    # Remapping the name of FP8 kv-scale.
+                    name = maybe_remap_kv_scale_name(name, params_dict)
+                    if name is None:
+                        continue
+
                     if is_pp_missing_parameter(name, self):
                         continue
 

From c3f05b09a040b9d13ad62914be3f7a84c535e417 Mon Sep 17 00:00:00 2001
From: Chenguang Li <757486878@qq.com>
Date: Mon, 13 Jan 2025 15:47:05 +0800
Subject: [PATCH 102/115] [Misc]Minor Changes about Worker (#11555)

Signed-off-by: Chenguang Li <757486878@qq.com>
---
 vllm/v1/worker/gpu_worker.py | 1 -
 vllm/worker/worker.py        | 1 -
 2 files changed, 2 deletions(-)

diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py
index af438f7d5820c..e83bce4283555 100644
--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -132,7 +132,6 @@ def determine_num_available_blocks(self) -> Tuple[int, int]:
         # Execute a forward pass with dummy inputs to profile the memory usage
         # of the model.
         self.model_runner.profile_run()
-        torch.cuda.synchronize()
 
         free_gpu_memory, _ = torch.cuda.mem_get_info()
         # NOTE(woosuk): Here we assume that the other processes using the same
diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py
index 0f12549e3f3fd..a3e377ef2b19d 100644
--- a/vllm/worker/worker.py
+++ b/vllm/worker/worker.py
@@ -200,7 +200,6 @@ def determine_num_available_blocks(self) -> Tuple[int, int]:
                               weights_memory_in_bytes=self.model_runner.
                               model_memory_usage) as result:
             self.model_runner.profile_run()
-            torch.cuda.synchronize()
 
         self._assert_memory_footprint_increased_during_profiling()
 

From 89ce62a316e68c50121e74d5a832e0cb9a5101d1 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Mon, 13 Jan 2025 16:20:52 +0800
Subject: [PATCH 103/115] [platform] add ray_device_key (#11948)

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 vllm/executor/ray_utils.py    | 19 +++++++++++++------
 vllm/platforms/cuda.py        |  1 +
 vllm/platforms/hpu.py         |  1 +
 vllm/platforms/interface.py   |  4 ++++
 vllm/platforms/neuron.py      |  1 +
 vllm/platforms/rocm.py        |  2 ++
 vllm/platforms/tpu.py         |  2 ++
 vllm/platforms/xpu.py         |  3 +++
 vllm/v1/executor/ray_utils.py | 13 +++++++++++--
 9 files changed, 38 insertions(+), 8 deletions(-)

diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py
index 8d766bad1a072..9f40f6a65dcd7 100644
--- a/vllm/executor/ray_utils.py
+++ b/vllm/executor/ray_utils.py
@@ -8,6 +8,7 @@
 from vllm.config import ParallelConfig
 from vllm.executor.msgspec_utils import decode_hook, encode_hook
 from vllm.logger import init_logger
+from vllm.platforms import current_platform
 from vllm.sequence import ExecuteModelRequest, IntermediateTensors
 from vllm.utils import get_ip
 from vllm.worker.worker_base import WorkerWrapperBase
@@ -47,7 +48,12 @@ def get_node_ip(self) -> str:
 
         def get_node_and_gpu_ids(self) -> Tuple[str, List[int]]:
             node_id = ray.get_runtime_context().get_node_id()
-            gpu_ids = ray.get_gpu_ids()
+            device_key = current_platform.ray_device_key
+            if not device_key:
+                raise RuntimeError("current platform %s does not support ray."
+                                   % current_platform.device_name)
+            gpu_ids = ray.get_runtime_context().get_accelerator_ids(
+            )[device_key]
             return node_id, gpu_ids
 
         def execute_model_spmd(
@@ -249,11 +255,12 @@ def initialize_ray_cluster(
         # Placement group is already set.
         return
 
-    device_str = "GPU"
-    if current_platform.is_tpu():
-        device_str = "TPU"
-    elif current_platform.is_hpu():
-        device_str = 'HPU'
+    device_str = current_platform.ray_device_key
+    if not device_str:
+        raise ValueError(
+            f"current platform {current_platform.device_name} does not "
+            "support ray.")
+
     # Create placement group for worker processes
     current_placement_group = ray.util.get_current_placement_group()
     if current_placement_group:
diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py
index 23ceac83e49de..3f77ec50ed31f 100644
--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -77,6 +77,7 @@ class CudaPlatformBase(Platform):
     device_name: str = "cuda"
     device_type: str = "cuda"
     dispatch_key: str = "CUDA"
+    ray_device_key: str = "GPU"
 
     @classmethod
     def get_device_capability(cls,
diff --git a/vllm/platforms/hpu.py b/vllm/platforms/hpu.py
index 8152d881fa8d9..0acb2804a5f66 100644
--- a/vllm/platforms/hpu.py
+++ b/vllm/platforms/hpu.py
@@ -19,6 +19,7 @@ class HpuPlatform(Platform):
     device_name: str = "hpu"
     device_type: str = "hpu"
     dispatch_key: str = "HPU"
+    ray_device_key: str = "HPU"
 
     @classmethod
     def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int,
diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py
index fe398801c5dd9..ec917f75689dd 100644
--- a/vllm/platforms/interface.py
+++ b/vllm/platforms/interface.py
@@ -82,6 +82,10 @@ class Platform:
     # check https://github.com/pytorch/pytorch/blob/313dac6c1ca0fa0cde32477509cce32089f8532a/torchgen/model.py#L134 # noqa
     # use "CPU" as a fallback for platforms not registered in PyTorch
     dispatch_key: str = "CPU"
+    # available ray device keys:
+    # https://github.com/ray-project/ray/blob/10ba5adadcc49c60af2c358a33bb943fb491a171/python/ray/_private/ray_constants.py#L438 # noqa
+    # empty string means the device does not support ray
+    ray_device_key: str = ""
     # The torch.compile backend for compiling simple and
     # standalone functions. The default value is "inductor" to keep
     # the same behavior as PyTorch.
diff --git a/vllm/platforms/neuron.py b/vllm/platforms/neuron.py
index a4bbbd27c8a89..7f4a867b32ba1 100644
--- a/vllm/platforms/neuron.py
+++ b/vllm/platforms/neuron.py
@@ -16,6 +16,7 @@ class NeuronPlatform(Platform):
     _enum = PlatformEnum.NEURON
     device_name: str = "neuron"
     device_type: str = "neuron"
+    ray_device_key: str = "neuron_cores"
     supported_quantization: list[str] = ["neuron_quant"]
 
     @classmethod
diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py
index 1c2f602efc856..f12e948113723 100644
--- a/vllm/platforms/rocm.py
+++ b/vllm/platforms/rocm.py
@@ -64,6 +64,8 @@ class RocmPlatform(Platform):
     device_name: str = "rocm"
     device_type: str = "cuda"
     dispatch_key: str = "CUDA"
+    ray_device_key: str = "GPU"
+
     supported_quantization: list[str] = [
         "awq", "gptq", "fp8", "compressed_tensors", "compressed-tensors",
         "fbgemm_fp8", "gguf"
diff --git a/vllm/platforms/tpu.py b/vllm/platforms/tpu.py
index 8a59b53ca4b15..460eb170bba34 100644
--- a/vllm/platforms/tpu.py
+++ b/vllm/platforms/tpu.py
@@ -19,6 +19,8 @@ class TpuPlatform(Platform):
     device_name: str = "tpu"
     device_type: str = "tpu"
     dispatch_key: str = "XLA"
+    ray_device_key: str = "TPU"
+
     supported_quantization: list[str] = [
         "tpu_int8", "compressed-tensors", "compressed_tensors"
     ]
diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py
index 00692a5d23031..cb74f79b31794 100644
--- a/vllm/platforms/xpu.py
+++ b/vllm/platforms/xpu.py
@@ -19,6 +19,9 @@ class XPUPlatform(Platform):
     device_name: str = "xpu"
     device_type: str = "xpu"
     dispatch_key: str = "XPU"
+    # Intel XPU's device key is "GPU" for Ray.
+    # see https://github.com/ray-project/ray/blob/6a5eb5865eeb9ccf058a79b44f107e327e360673/python/ray/_private/accelerators/intel_gpu.py#L20 # noqa: E501
+    ray_device_key: str = "GPU"
 
     @classmethod
     def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int,
diff --git a/vllm/v1/executor/ray_utils.py b/vllm/v1/executor/ray_utils.py
index 7733610e59c7f..fc9715b7a5909 100644
--- a/vllm/v1/executor/ray_utils.py
+++ b/vllm/v1/executor/ray_utils.py
@@ -41,7 +41,12 @@ def get_node_ip(self) -> str:
 
         def get_node_and_gpu_ids(self) -> Tuple[str, List[int]]:
             node_id = ray.get_runtime_context().get_node_id()
-            gpu_ids = ray.get_gpu_ids()
+            device_key = current_platform.ray_device_key
+            if not device_key:
+                raise RuntimeError("current platform %s does not support ray."
+                                   % current_platform.device_name)
+            gpu_ids = ray.get_runtime_context().get_accelerator_ids(
+            )[device_key]
             return node_id, gpu_ids
 
         def setup_device_if_necessary(self):
@@ -211,7 +216,11 @@ def initialize_ray_cluster(
         # Placement group is already set.
         return
 
-    device_str = "GPU" if not current_platform.is_tpu() else "TPU"
+    device_str = current_platform.ray_device_key
+    if not device_str:
+        raise ValueError(
+            f"current platform {current_platform.device_name} does not "
+            "support ray.")
     # Create placement group for worker processes
     current_placement_group = ray.util.get_current_placement_group()
     if current_placement_group:

From 5340a30d0193547a19e236757fec1f3f246642f9 Mon Sep 17 00:00:00 2001
From: Alex Brooks <alex.brooks@ibm.com>
Date: Mon, 13 Jan 2025 01:37:48 -0700
Subject: [PATCH 104/115] Fix Max Token ID for Qwen-VL-Chat (#11980)

Signed-off-by: Alex-Brooks <Alex.brooks@ibm.com>
---
 vllm/transformers_utils/tokenizer.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/vllm/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py
index 97920f42ec52f..294262484f2fb 100644
--- a/vllm/transformers_utils/tokenizer.py
+++ b/vllm/transformers_utils/tokenizer.py
@@ -1,3 +1,4 @@
+import contextlib
 import os
 import warnings
 from pathlib import Path
@@ -67,7 +68,15 @@ def get_cached_tokenizer(tokenizer: AnyTokenizer) -> AnyTokenizer:
         tokenizer.all_special_tokens_extended)
     tokenizer_all_special_tokens = set(tokenizer.all_special_tokens)
     tokenizer_len = len(tokenizer)
+
     max_token_id = max(tokenizer.get_vocab().values())
+    # Some tokenizers (e.g., QwenTokenizer) have special tokens that
+    # are added and included in the implementation of the vocab_size
+    # property, but not in get_vocab(); if there is an implementation
+    # of vocab size, we should take the greater value.
+    if hasattr(tokenizer, "vocab_size"):
+        with contextlib.suppress(NotImplementedError):
+            max_token_id = max(max_token_id, tokenizer.vocab_size)
 
     class CachedTokenizer(tokenizer.__class__):  # type: ignore
 

From 0f8cafe2d1550a33803fb64b2224e6bf3f913449 Mon Sep 17 00:00:00 2001
From: Chen Zhang <zhangch99@outlook.com>
Date: Mon, 13 Jan 2025 19:28:53 +0800
Subject: [PATCH 105/115] [Kernel] unified_attention for Attention.forward
 (#11967)

Signed-off-by: Chen Zhang <zhangch99@outlook.com>
---
 vllm/attention/layer.py              | 26 ++++++++++++++------------
 vllm/utils.py                        |  1 -
 vllm/worker/hpu_model_runner.py      | 13 +++++++++++--
 vllm/worker/hpu_worker.py            |  3 +++
 vllm/worker/neuron_model_runner.py   | 17 ++++++++++-------
 vllm/worker/openvino_model_runner.py |  4 +++-
 vllm/worker/openvino_worker.py       | 13 +++++++++++--
 vllm/worker/tpu_model_runner.py      | 28 ++++++++++++++++++----------
 vllm/worker/tpu_worker.py            |  6 +++++-
 vllm/worker/xpu_model_runner.py      | 21 ++++++++++++---------
 10 files changed, 87 insertions(+), 45 deletions(-)

diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py
index b8afd428f2cc0..c7e7a4d52e5a7 100644
--- a/vllm/attention/layer.py
+++ b/vllm/attention/layer.py
@@ -134,15 +134,10 @@ def forward(
         query: torch.Tensor,
         key: torch.Tensor,
         value: torch.Tensor,
-        kv_cache: torch.Tensor,
-        attn_metadata: AttentionMetadata,
+        _kv_cache: torch.Tensor,
+        _attn_metadata: AttentionMetadata,
     ) -> torch.Tensor:
-
-        if self.use_direct_call:
-            return self.impl.forward(query, key, value, kv_cache,
-                                     attn_metadata, self._k_scale,
-                                     self._v_scale)
-        elif self.use_output:
+        if self.use_output:
             output = torch.empty_like(query)
             hidden_size = query.size(-1)
             # Reshape the query, key, and value tensors.
@@ -154,12 +149,19 @@ def forward(
                 key = key.view(-1, self.num_kv_heads, self.head_size)
             if value is not None:
                 value = value.view(-1, self.num_kv_heads, self.head_size)
-            torch.ops.vllm.unified_attention_with_output(
-                query, key, value, output, self.layer_name)
+            if self.use_direct_call:
+                unified_attention_with_output(query, key, value, output,
+                                              self.layer_name)
+            else:
+                torch.ops.vllm.unified_attention_with_output(
+                    query, key, value, output, self.layer_name)
             return output.view(-1, hidden_size)
         else:
-            return torch.ops.vllm.unified_attention(query, key, value,
-                                                    self.layer_name)
+            if self.use_direct_call:
+                return unified_attention(query, key, value, self.layer_name)
+            else:
+                return torch.ops.vllm.unified_attention(
+                    query, key, value, self.layer_name)
 
     def extra_repr(self) -> str:
         s = f"head_size={self.impl.head_size}"  # type: ignore
diff --git a/vllm/utils.py b/vllm/utils.py
index 217ccb25cef6d..9a509da3c1ef1 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -2171,5 +2171,4 @@ def bind_kv_cache(
         forward_ctx = ctx[layer_name]
         assert len(forward_ctx.kv_cache) == len(kv_cache)
         for ve, ve_kv_cache in enumerate(kv_cache):
-            assert forward_ctx.kv_cache[ve].numel() == 0
             forward_ctx.kv_cache[ve] = ve_kv_cache[kv_cache_idx]
diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py
index 9d479f412af46..3e5105f3b62e3 100644
--- a/vllm/worker/hpu_model_runner.py
+++ b/vllm/worker/hpu_model_runner.py
@@ -28,6 +28,7 @@
 from vllm.attention import AttentionMetadata, get_attn_backend
 from vllm.config import DeviceConfig, VllmConfig
 from vllm.distributed.parallel_state import get_world_group
+from vllm.forward_context import set_forward_context
 from vllm.logger import init_logger
 from vllm.lora.layers import LoRAMapping
 from vllm.lora.request import LoRARequest
@@ -40,7 +41,8 @@
 from vllm.sampling_params import SamplingParams
 from vllm.sequence import (IntermediateTensors, SequenceData,
                            SequenceGroupMetadata)
-from vllm.utils import is_pin_memory_available, make_tensor_with_pad
+from vllm.utils import (bind_kv_cache, is_pin_memory_available,
+                        make_tensor_with_pad)
 from vllm.worker.model_runner_base import (
     ModelRunnerBase, ModelRunnerInputBase,
     _add_attn_metadata_broadcastable_dict,
@@ -1286,6 +1288,9 @@ def create_dummy_seq_group_metadata(self,
     def profile_run(self) -> None:
         num_layers = self.model_config.get_num_layers(self.parallel_config)
         kv_caches = [None] * num_layers
+        bind_kv_cache(
+            self.vllm_config.compilation_config.static_forward_context,
+            [kv_caches])
         max_seq_len = self.bucketing_global_state.prompt_seq_bucket_cfg[-1]
         max_batch_size = min(self.max_num_batched_tokens // max_seq_len,
                              self.scheduler_config.max_num_seqs)
@@ -1943,7 +1948,11 @@ def execute_model(
                                 f"graphs{'T' if use_graphs else 'F'}")
         else:
             model_event_name = 'model_executable'
-        with self.profiler.record_event('internal', model_event_name):
+        with set_forward_context(
+                model_input.attn_metadata, self.vllm_config,
+                model_input.virtual_engine), \
+            self.profiler.record_event(
+                    'internal', model_event_name):
             hidden_states = self.model.forward(
                 **execute_model_kwargs,
                 selected_token_indices=sampling_metadata.selected_token_indices
diff --git a/vllm/worker/hpu_worker.py b/vllm/worker/hpu_worker.py
index cca7cd50bfc7b..8b2d8aaed2803 100644
--- a/vllm/worker/hpu_worker.py
+++ b/vllm/worker/hpu_worker.py
@@ -20,6 +20,7 @@
 from vllm.model_executor import set_random_seed
 from vllm.prompt_adapter.request import PromptAdapterRequest
 from vllm.sequence import ExecuteModelRequest
+from vllm.utils import bind_kv_cache
 from vllm.worker.cache_engine import CacheEngine
 from vllm.worker.hpu_model_runner import HPUModelRunner
 from vllm.worker.model_runner_base import ModelRunnerBase
@@ -215,6 +216,8 @@ def _init_cache_engine(self):
             self.cache_engine[ve].gpu_cache
             for ve in range(self.parallel_config.pipeline_parallel_size)
         ]
+        bind_kv_cache(self.compilation_config.static_forward_context,
+                      self.hpu_cache)
 
     def _warm_up_model(self) -> None:
         # NOTE(kzawora): We should use virtual engine index here
diff --git a/vllm/worker/neuron_model_runner.py b/vllm/worker/neuron_model_runner.py
index ae4eb6ba6eaec..a35f5467e1a1f 100644
--- a/vllm/worker/neuron_model_runner.py
+++ b/vllm/worker/neuron_model_runner.py
@@ -8,6 +8,7 @@
 from transformers_neuronx.config import GenerationConfig
 
 from vllm.config import VllmConfig
+from vllm.forward_context import set_forward_context
 from vllm.logger import init_logger
 from vllm.model_executor import SamplingMetadata
 from vllm.model_executor.layers.sampler import SamplerOutput
@@ -314,13 +315,15 @@ def execute_model(
             raise ValueError(
                 "NeuronModelRunner does not support multi-step execution.")
 
-        hidden_states = self.model(
-            input_ids=model_input.input_tokens,
-            positions=model_input.input_positions,
-            input_block_ids=model_input.input_block_ids,
-            **MultiModalKwargs.as_kwargs(model_input.multi_modal_kwargs or {},
-                                         device=self.device),
-        )
+        with set_forward_context(None, self.vllm_config, 0):
+            hidden_states = self.model(
+                input_ids=model_input.input_tokens,
+                positions=model_input.input_positions,
+                input_block_ids=model_input.input_block_ids,
+                **MultiModalKwargs.as_kwargs(model_input.multi_modal_kwargs
+                                             or {},
+                                             device=self.device),
+            )
 
         # Compute the logits only if the on-device sampling is turned off as
         # on-device sampling outputs the token ids.
diff --git a/vllm/worker/openvino_model_runner.py b/vllm/worker/openvino_model_runner.py
index 6000e5dfe4e30..a38b5a4e6e8d5 100644
--- a/vllm/worker/openvino_model_runner.py
+++ b/vllm/worker/openvino_model_runner.py
@@ -8,6 +8,7 @@
 from vllm.attention import get_attn_backend
 from vllm.attention.backends.openvino import OpenVINOAttentionMetadata
 from vllm.config import VllmConfig
+from vllm.forward_context import set_forward_context
 from vllm.logger import init_logger
 from vllm.model_executor import SamplingMetadata
 from vllm.model_executor.layers.sampler import SamplerOutput
@@ -350,7 +351,8 @@ def execute_model(
                                          device=self.device),
         }
 
-        hidden_states = model_executable(**execute_model_kwargs)
+        with set_forward_context(attn_metadata, self.vllm_config, 0):
+            hidden_states = model_executable(**execute_model_kwargs)
 
         # Compute the logits.
         logits = self.model.compute_logits(hidden_states, sampling_metadata)
diff --git a/vllm/worker/openvino_worker.py b/vllm/worker/openvino_worker.py
index 0bf522d5333ed..3482073566215 100644
--- a/vllm/worker/openvino_worker.py
+++ b/vllm/worker/openvino_worker.py
@@ -20,6 +20,7 @@
 from vllm.platforms import current_platform
 from vllm.sampling_params import SamplingParams
 from vllm.sequence import ExecuteModelRequest, SequenceGroupMetadata
+from vllm.utils import bind_kv_cache
 from vllm.worker.openvino_model_runner import OpenVINOModelRunner
 from vllm.worker.worker_base import LoraNotSupportedWorkerBase, WorkerBase
 
@@ -339,6 +340,8 @@ def _init_cache_engine(self) -> None:
             ov_device,
         )
         self.kv_cache = self.cache_engine.kv_cache
+        bind_kv_cache(self.compilation_config.static_forward_context,
+                      [self.kv_cache])
         self.model_runner.block_size = self.cache_engine.block_size
 
         assert self.kv_cache is not None
@@ -507,12 +510,18 @@ def model_profile_run():
 
             self.model_runner.block_size = tmp_cache_config.block_size
 
+            bind_kv_cache(self.compilation_config.static_forward_context,
+                          profiling_cache_engine.kv_cache)
             # Run the model with the dummy inputs.
             self.model_runner.execute_model(seqs,
                                             profiling_cache_engine.kv_cache)
 
-            # explicitly delete temporary KV cache manager to free KV cache
-            # when real inputs will be passed to OV
+            # Explicitly revert bind_kv_cache and delete temporary KV cache
+            # manager to free KV cache when real inputs will be passed to OV
+            bind_kv_cache(self.compilation_config.static_forward_context, [[
+                torch.tensor([])
+                for _ in range(len(profiling_cache_engine.kv_cache))
+            ]])
             del profiling_cache_engine
 
             logger.info(
diff --git a/vllm/worker/tpu_model_runner.py b/vllm/worker/tpu_model_runner.py
index 7bdb7f0e2d6a9..52c577bccab9c 100644
--- a/vllm/worker/tpu_model_runner.py
+++ b/vllm/worker/tpu_model_runner.py
@@ -13,6 +13,7 @@
 
 from vllm.attention import AttentionMetadata, get_attn_backend
 from vllm.config import VllmConfig
+from vllm.forward_context import set_forward_context
 from vllm.logger import init_logger
 from vllm.model_executor.layers.sampler import SamplerOutput
 from vllm.model_executor.model_loader import get_model
@@ -265,8 +266,9 @@ def _dummy_run(
             torch._dynamo.mark_dynamic(t, 0)
             torch._dynamo.mark_dynamic(p, 0)
         # Dummy run.
-        self.model(token_ids, position_ids, attn_metadata, input_lens, t, p,
-                   num_samples, kv_caches)
+        with set_forward_context(attn_metadata, self.vllm_config, 0):
+            self.model(token_ids, position_ids, attn_metadata, input_lens, t,
+                       p, num_samples, kv_caches)
 
     def warmup_model(
         self,
@@ -663,10 +665,13 @@ def execute_model(
                 input_lens = model_input.input_lens[i:i + 1].to(self.device)
                 t = model_input.t[i:i + 1].to(self.device)
                 p = model_input.p[i:i + 1].to(self.device)
-                output_token_ids = self.model(token_ids, position_ids,
-                                              attn_metadata, input_lens, t, p,
-                                              model_input.num_samples,
-                                              kv_caches)
+                with set_forward_context(model_input.attn_metadata,
+                                         self.vllm_config,
+                                         model_input.virtual_engine):
+                    output_token_ids = self.model(token_ids, position_ids,
+                                                  attn_metadata, input_lens, t,
+                                                  p, model_input.num_samples,
+                                                  kv_caches)
                 next_token_ids.append(output_token_ids[0])
                 start_idx = end_idx
 
@@ -711,10 +716,13 @@ def execute_model(
             input_lens = model_input.input_lens.to(self.device)
             for i in range(num_steps):
                 slot_mapping = attn_metadata.slot_mapping
-                output_token_ids = self.model(token_ids, position_ids,
-                                              attn_metadata, input_lens, t, p,
-                                              model_input.num_samples,
-                                              kv_caches)
+                with set_forward_context(model_input.attn_metadata,
+                                         self.vllm_config,
+                                         model_input.virtual_engine):
+                    output_token_ids = self.model(token_ids, position_ids,
+                                                  attn_metadata, input_lens, t,
+                                                  p, model_input.num_samples,
+                                                  kv_caches)
                 self.cached_step_outputs.append(output_token_ids)
 
                 if i < num_steps - 1:
diff --git a/vllm/worker/tpu_worker.py b/vllm/worker/tpu_worker.py
index 8754f7538f251..ea0e700545b16 100644
--- a/vllm/worker/tpu_worker.py
+++ b/vllm/worker/tpu_worker.py
@@ -12,7 +12,7 @@
 from vllm.logger import init_logger
 from vllm.model_executor import set_random_seed
 from vllm.sequence import ExecuteModelRequest
-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, get_dtype_size
+from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, bind_kv_cache, get_dtype_size
 from vllm.worker.tpu_model_runner import ExecutionMode, TPUModelRunner
 from vllm.worker.worker_base import (LocalOrDistributedWorkerBase,
                                      LoraNotSupportedWorkerBase, WorkerBase,
@@ -108,6 +108,8 @@ def determine_num_available_blocks(self) -> Tuple[int, int]:
                       torch.tensor([], dtype=torch.float32,
                                    device=self.device))
                      for _ in range(num_layers)]
+        bind_kv_cache(self.compilation_config.static_forward_context,
+                      [kv_caches])
         self.model_runner._dummy_run(
             batch_size=1,
             seq_len=self.scheduler_config.max_num_batched_tokens,
@@ -170,6 +172,8 @@ def initialize_cache(
                                       device="cpu")
             cpu_v_cache = torch.zeros_like(cpu_k_cache)
             self.cpu_cache.append((cpu_k_cache, cpu_v_cache))
+        bind_kv_cache(self.compilation_config.static_forward_context,
+                      [self.tpu_cache])
         self._warmup_model()
 
     def _warmup_model(self) -> None:
diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py
index 9cf25387560da..82b8f22a5af33 100644
--- a/vllm/worker/xpu_model_runner.py
+++ b/vllm/worker/xpu_model_runner.py
@@ -12,6 +12,7 @@
 from vllm.attention import get_attn_backend
 from vllm.config import VllmConfig
 from vllm.distributed import get_pp_group
+from vllm.forward_context import set_forward_context
 from vllm.inputs import INPUT_REGISTRY, InputRegistry
 from vllm.logger import init_logger
 from vllm.model_executor import SamplingMetadataCache
@@ -562,15 +563,17 @@ def execute_model(
         if (self.observability_config is not None
                 and self.observability_config.collect_model_forward_time):
             model_forward_start_time = time.time()
-
-        hidden_or_intermediate_states = model_executable(
-            input_ids=model_input.input_tokens,
-            positions=model_input.input_positions,
-            kv_caches=kv_caches,
-            attn_metadata=model_input.attn_metadata,
-            intermediate_tensors=intermediate_tensors,
-            **MultiModalKwargs.as_kwargs(model_input.multi_modal_kwargs or {},
-                                         device=self.device))
+        with set_forward_context(model_input.attn_metadata, self.vllm_config,
+                                 model_input.virtual_engine):
+            hidden_or_intermediate_states = model_executable(
+                input_ids=model_input.input_tokens,
+                positions=model_input.input_positions,
+                kv_caches=kv_caches,
+                attn_metadata=model_input.attn_metadata,
+                intermediate_tensors=intermediate_tensors,
+                **MultiModalKwargs.as_kwargs(model_input.multi_modal_kwargs
+                                             or {},
+                                             device=self.device))
         # Compute the logits in the last pipeline stage.
         if not get_pp_group().is_last_rank:
             return hidden_or_intermediate_states

From cd8249903f189c5f06424e67dbc6512ca494a046 Mon Sep 17 00:00:00 2001
From: Roger Wang <136131678+ywang96@users.noreply.github.com>
Date: Mon, 13 Jan 2025 03:58:54 -0800
Subject: [PATCH 106/115] [Doc][V1] Update model implementation guide for V1
 support (#11998)

Signed-off-by: Roger Wang <ywang@roblox.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
---
 docs/source/contributing/model/basic.md      | 12 ++-
 docs/source/contributing/model/multimodal.md | 87 ++++++++++++++++----
 2 files changed, 83 insertions(+), 16 deletions(-)

diff --git a/docs/source/contributing/model/basic.md b/docs/source/contributing/model/basic.md
index 5c2dc486c8bea..b9b92fd027f6e 100644
--- a/docs/source/contributing/model/basic.md
+++ b/docs/source/contributing/model/basic.md
@@ -57,7 +57,17 @@ class MyModelForCausalLM(nn.Module):
 
 ### Computation Code
 
-Rewrite the {meth}`~torch.nn.Module.forward` method of your model to remove any unnecessary code, such as training-specific code. Modify the input parameters to treat `input_ids` and `positions` as flattened tensors with a single batch size dimension, without a max-sequence length dimension.
+- Add a `get_input_embeddings` method inside `MyModel` module that returns the text embeddings given `input_ids`. This is equivalent to directly calling the text embedding layer, but provides a unified interface in case `MyModel` is used within a composite multimodal model.
+
+```python
+class MyModel(nn.Module):
+    ...
+
+    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+        ...
+```
+
+- Rewrite the {meth}`~torch.nn.Module.forward` method of your model to remove any unnecessary code, such as training-specific code. Modify the input parameters to treat `input_ids` and `positions` as flattened tensors with a single batch size dimension, without a max-sequence length dimension.
 
 ```python
 def forward(
diff --git a/docs/source/contributing/model/multimodal.md b/docs/source/contributing/model/multimodal.md
index 99f6a1d5462c4..e5fd9a2877ceb 100644
--- a/docs/source/contributing/model/multimodal.md
+++ b/docs/source/contributing/model/multimodal.md
@@ -9,7 +9,78 @@ This document walks you through the steps to extend a basic model so that it acc
 It is assumed that you have already implemented the model in vLLM according to [these steps](#new-model-basic).
 Further update the model as follows:
 
-- Implement the {class}`~vllm.model_executor.models.interfaces.SupportsMultiModal` interface.
+- Reserve a keyword parameter in {meth}`~torch.nn.Module.forward` for each input tensor that corresponds to a multi-modal input, as shown in the following example:
+
+  ```diff
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        kv_caches: List[torch.Tensor],
+        attn_metadata: AttentionMetadata,
+  +     pixel_values: torch.Tensor,
+    ) -> SamplerOutput:
+  ```
+  
+  More conveniently, you can simply pass `**kwargs` to the {meth}`~torch.nn.Module.forward` method and retrieve the keyword parameters for multimodal inputs from it.
+
+- Implement {meth}`~vllm.model_executor.models.interfaces.SupportsMultiModal.get_multimodal_embeddings` that returns the embeddings from running the multimodal inputs through the multimodal tokenizer of the model. Below we provide a boilerplate of a typical implementation pattern, but feel free to adjust it to your own needs.
+
+    ```python
+    class YourModelForImage2Seq(nn.Module):
+        ...
+
+        def _process_image_input(self, image_input: YourModelImageInputs) -> torch.Tensor:
+
+            assert self.vision_encoder is not None
+            image_features = self.vision_encoder(image_input)
+            return self.multi_modal_projector(image_features)
+
+        def get_multimodal_embeddings(self, **kwargs: object) -> Optional[NestedTensors]:
+
+            # Validate the multimodal input keyword arguments
+            image_input = self._parse_and_validate_image_input(**kwargs)
+            if image_input is None:
+                return None
+
+            # Run multimodal inputs through encoder and projector
+            vision_embeddings = self._process_image_input(image_input)
+            return vision_embeddings
+    ```
+
+    ```{important}
+    The returned `multimodal_embeddings` must be either a **3D {class}`torch.Tensor`** of shape `(num_items, feature_size, hidden_size)`, or a **list / tuple of 2D {class}`torch.Tensor`'s** of shape `(feature_size, hidden_size)`, so that `multimodal_embeddings[i]` retrieves the embeddings generated from the `i`-th multimodal data item (e.g., image) of the request.
+    ```
+
+- Implement {meth}`~vllm.model_executor.models.interfaces.SupportsMultiModal.get_input_embeddings` to merge `multimodal_embeddings` with text embeddings from the `input_ids`. If input processing for the model is implemented correctly (see sections below), then you can leverage the utility function we provide to easily merge the embeddings.
+
+    ```python
+    from .utils import merge_multimodal_embeddings
+
+    class YourModelForImage2Seq(nn.Module):
+        ...
+
+        def get_input_embeddings(
+            self,
+            input_ids: torch.Tensor,
+            multimodal_embeddings: Optional[NestedTensors] = None,
+        ) -> torch.Tensor:
+
+            # `get_input_embeddings` should already be implemented for the language 
+            # model as one of the requirements of basic vLLM model implementation.
+            inputs_embeds = self.language_model.get_input_embeddings(input_ids)
+
+            if multimodal_embeddings is not None:
+                inputs_embeds = merge_multimodal_embeddings(
+                    input_ids=input_ids, 
+                    inputs_embeds=inputs_embeds, 
+                    multimodal_embeddings=multimodal_embeddings,
+                    placeholder_token_id=self.config.image_token_index)
+
+            return inputs_embeds
+    ```
+
+- Once the above steps are done, update the model class with the {class}`~vllm.model_executor.models.interfaces.SupportsMultiModal` interface.
 
   ```diff
   + from vllm.model_executor.models.interfaces import SupportsMultiModal
@@ -23,20 +94,6 @@ Further update the model as follows:
   Check out [the HuggingFace Transformers documentation](https://huggingface.co/docs/transformers/model_doc/auto#multimodal) for some examples.
   ```
 
-- If you haven't already done so, reserve a keyword parameter in {meth}`~torch.nn.Module.forward`
-  for each input tensor that corresponds to a multi-modal input, as shown in the following example:
-
-  ```diff
-    def forward(
-        self,
-        input_ids: torch.Tensor,
-        positions: torch.Tensor,
-        kv_caches: List[torch.Tensor],
-        attn_metadata: AttentionMetadata,
-  +     pixel_values: torch.Tensor,
-    ) -> SamplerOutput:
-  ```
-
 ## 2. Specify processing information
 
 Next, create a subclass of {class}`~vllm.multimodal.processing.BaseProcessingInfo`

From e8c23ff989d4b061726315bbf74d0bca7136fdc4 Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Mon, 13 Jan 2025 12:27:36 +0000
Subject: [PATCH 107/115] [Doc] Organise installation documentation into
 categories and tabs (#11935)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 docs/source/conf.py                           |   2 +-
 docs/source/deployment/docker.md              |   4 +
 docs/source/features/compatibility_matrix.md  |   4 +-
 .../hpu-gaudi.inc.md}                         |  72 ++--
 .../installation/ai_accelerator/index.md      | 375 ++++++++++++++++++
 .../neuron.inc.md}                            |  67 ++--
 .../openvino.inc.md}                          |  78 ++--
 .../{tpu.md => ai_accelerator/tpu.inc.md}     |  38 +-
 .../getting_started/installation/cpu-arm.md   |  46 ---
 .../{cpu-apple.md => cpu/apple.inc.md}        |  28 +-
 .../installation/cpu/arm.inc.md               |  30 ++
 .../installation/cpu/build.inc.md             |  21 +
 .../installation/{cpu-x86.md => cpu/index.md} | 190 ++++++---
 .../installation/cpu/x86.inc.md               |  35 ++
 .../installation/device.template.md           |  17 +
 .../{gpu-cuda.md => gpu/cuda.inc.md}          |  96 ++---
 .../getting_started/installation/gpu/index.md | 300 ++++++++++++++
 .../{gpu-rocm.md => gpu/rocm.inc.md}          | 147 +++----
 .../installation/{xpu.md => gpu/xpu.inc.md}   |  51 ++-
 .../getting_started/installation/index.md     |  13 +-
 .../installation/python_env_setup.inc.md      |  19 +
 21 files changed, 1241 insertions(+), 392 deletions(-)
 rename docs/source/getting_started/installation/{hpu-gaudi.md => ai_accelerator/hpu-gaudi.inc.md} (97%)
 create mode 100644 docs/source/getting_started/installation/ai_accelerator/index.md
 rename docs/source/getting_started/installation/{neuron.md => ai_accelerator/neuron.inc.md} (86%)
 rename docs/source/getting_started/installation/{openvino.md => ai_accelerator/openvino.inc.md} (69%)
 rename docs/source/getting_started/installation/{tpu.md => ai_accelerator/tpu.inc.md} (88%)
 delete mode 100644 docs/source/getting_started/installation/cpu-arm.md
 rename docs/source/getting_started/installation/{cpu-apple.md => cpu/apple.inc.md} (70%)
 create mode 100644 docs/source/getting_started/installation/cpu/arm.inc.md
 create mode 100644 docs/source/getting_started/installation/cpu/build.inc.md
 rename docs/source/getting_started/installation/{cpu-x86.md => cpu/index.md} (67%)
 create mode 100644 docs/source/getting_started/installation/cpu/x86.inc.md
 create mode 100644 docs/source/getting_started/installation/device.template.md
 rename docs/source/getting_started/installation/{gpu-cuda.md => gpu/cuda.inc.md} (84%)
 create mode 100644 docs/source/getting_started/installation/gpu/index.md
 rename docs/source/getting_started/installation/{gpu-rocm.md => gpu/rocm.inc.md} (87%)
 rename docs/source/getting_started/installation/{xpu.md => gpu/xpu.inc.md} (80%)
 create mode 100644 docs/source/getting_started/installation/python_env_setup.inc.md

diff --git a/docs/source/conf.py b/docs/source/conf.py
index bff0141ffbce8..7aa52db092e36 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -56,7 +56,7 @@
 # List of patterns, relative to source directory, that match files and
 # directories to ignore when looking for source files.
 # This pattern also affects html_static_path and html_extra_path.
-exclude_patterns: List[str] = ["**/*.template.md"]
+exclude_patterns: List[str] = ["**/*.template.md", "**/*.inc.md"]
 
 # Exclude the prompt "$" when copying code
 copybutton_prompt_text = r"\$ "
diff --git a/docs/source/deployment/docker.md b/docs/source/deployment/docker.md
index c735bfd0e87a7..9e301483ef7f9 100644
--- a/docs/source/deployment/docker.md
+++ b/docs/source/deployment/docker.md
@@ -2,6 +2,8 @@
 
 # Using Docker
 
+(deployment-docker-pre-built-image)=
+
 ## Use vLLM's Official Docker Image
 
 vLLM offers an official Docker image for deployment.
@@ -23,6 +25,8 @@ container to access the host's shared memory. vLLM uses PyTorch, which uses shar
 memory to share data between processes under the hood, particularly for tensor parallel inference.
 ```
 
+(deployment-docker-build-image-from-source)=
+
 ## Building vLLM's Docker Image from Source
 
 You can build and run vLLM from source via the provided <gh-file:Dockerfile>. To build vLLM:
diff --git a/docs/source/features/compatibility_matrix.md b/docs/source/features/compatibility_matrix.md
index ea1d545ff3d73..86a82eb36df33 100644
--- a/docs/source/features/compatibility_matrix.md
+++ b/docs/source/features/compatibility_matrix.md
@@ -322,7 +322,9 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar
 
 ```
 
-### Feature x Hardware
+(feature-x-hardware)=
+
+## Feature x Hardware
 
 ```{list-table}
    :header-rows: 1
diff --git a/docs/source/getting_started/installation/hpu-gaudi.md b/docs/source/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md
similarity index 97%
rename from docs/source/getting_started/installation/hpu-gaudi.md
rename to docs/source/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md
index a829b1c9ff996..b4695d504b601 100644
--- a/docs/source/getting_started/installation/hpu-gaudi.md
+++ b/docs/source/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md
@@ -1,10 +1,13 @@
-(installation-gaudi)=
+# Installation
 
-# Installation for Intel® Gaudi®
+This tab provides instructions on running vLLM with Intel Gaudi devices.
 
-This README provides instructions on running vLLM with Intel Gaudi devices.
+## Requirements
 
-## Requirements and Installation
+- OS: Ubuntu 22.04 LTS
+- Python: 3.10
+- Intel Gaudi accelerator
+- Intel Gaudi software version 1.18.0
 
 Please follow the instructions provided in the [Gaudi Installation
 Guide](https://docs.habana.ai/en/latest/Installation_Guide/index.html)
@@ -12,27 +15,9 @@ to set up the execution environment. To achieve the best performance,
 please follow the methods outlined in the [Optimizing Training Platform
 Guide](https://docs.habana.ai/en/latest/PyTorch/Model_Optimization_PyTorch/Optimization_in_Training_Platform.html).
 
-### Requirements
-
-- OS: Ubuntu 22.04 LTS
-- Python: 3.10
-- Intel Gaudi accelerator
-- Intel Gaudi software version 1.18.0
-
-### Quick start using Dockerfile
-
-```console
-docker build -f Dockerfile.hpu -t vllm-hpu-env  .
-docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --rm vllm-hpu-env
-```
-
-```{tip}
-If you're observing the following error: `docker: Error response from daemon: Unknown runtime specified habana.`, please refer to "Install Using Containers" section of [Intel Gaudi Software Stack and Driver Installation](https://docs.habana.ai/en/v1.18.0/Installation_Guide/Bare_Metal_Fresh_OS.html). Make sure you have `habana-container-runtime` package installed and that `habana` container runtime is registered.
-```
+## Configure a new environment
 
-### Build from source
-
-#### Environment verification
+### Environment verification
 
 To verify that the Intel Gaudi software was correctly installed, run:
 
@@ -47,7 +32,7 @@ Refer to [Intel Gaudi Software Stack
 Verification](https://docs.habana.ai/en/latest/Installation_Guide/SW_Verification.html#platform-upgrade)
 for more details.
 
-#### Run Docker Image
+### Run Docker Image
 
 It is highly recommended to use the latest Docker image from Intel Gaudi
 vault. Refer to the [Intel Gaudi
@@ -61,7 +46,13 @@ docker pull vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-i
 docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest
 ```
 
-#### Build and Install vLLM
+## Set up using Python
+
+### Pre-built wheels
+
+Currently, there are no pre-built Intel Gaudi wheels.
+
+### Build wheel from source
 
 To build and install vLLM from source, run:
 
@@ -80,7 +71,26 @@ git checkout habana_main
 python setup.py develop
 ```
 
-## Supported Features
+## Set up using Docker
+
+### Pre-built images
+
+Currently, there are no pre-built Intel Gaudi images.
+
+### Build image from source
+
+```console
+docker build -f Dockerfile.hpu -t vllm-hpu-env  .
+docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --rm vllm-hpu-env
+```
+
+```{tip}
+If you're observing the following error: `docker: Error response from daemon: Unknown runtime specified habana.`, please refer to "Install Using Containers" section of [Intel Gaudi Software Stack and Driver Installation](https://docs.habana.ai/en/v1.18.0/Installation_Guide/Bare_Metal_Fresh_OS.html). Make sure you have `habana-container-runtime` package installed and that `habana` container runtime is registered.
+```
+
+## Extra information
+
+## Supported features
 
 - [Offline inference](#offline-inference)
 - Online serving via [OpenAI-Compatible Server](#openai-compatible-server)
@@ -94,14 +104,14 @@ python setup.py develop
   for accelerating low-batch latency and throughput
 - Attention with Linear Biases (ALiBi)
 
-## Unsupported Features
+## Unsupported features
 
 - Beam search
 - LoRA adapters
 - Quantization
 - Prefill chunking (mixed-batch inferencing)
 
-## Supported Configurations
+## Supported configurations
 
 The following configurations have been validated to be function with
 Gaudi2 devices. Configurations that are not listed may or may not work.
@@ -137,7 +147,7 @@ Gaudi2 devices. Configurations that are not listed may or may not work.
 - [meta-llama/Meta-Llama-3.1-70B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct)
   with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling
 
-## Performance Tuning
+## Performance tuning
 
 ### Execution modes
 
@@ -368,7 +378,7 @@ Additionally, there are HPU PyTorch Bridge environment variables impacting vLLM
 - `PT_HPU_LAZY_MODE`: if `0`, PyTorch Eager backend for Gaudi will be used, if `1` PyTorch Lazy backend for Gaudi will be used, `1` is default
 - `PT_HPU_ENABLE_LAZY_COLLECTIVES`: required to be `true` for tensor parallel inference with HPU Graphs
 
-## Troubleshooting: Tweaking HPU Graphs
+## Troubleshooting: tweaking HPU graphs
 
 If you experience device out-of-memory issues or want to attempt
 inference at higher batch sizes, try tweaking HPU Graphs by following
diff --git a/docs/source/getting_started/installation/ai_accelerator/index.md b/docs/source/getting_started/installation/ai_accelerator/index.md
new file mode 100644
index 0000000000000..a6c4c44305a4c
--- /dev/null
+++ b/docs/source/getting_started/installation/ai_accelerator/index.md
@@ -0,0 +1,375 @@
+# Other AI accelerators
+
+vLLM is a Python library that supports the following AI accelerators. Select your AI accelerator type to see vendor specific instructions:
+
+::::{tab-set}
+:sync-group: device
+
+:::{tab-item} TPU
+:sync: tpu
+
+```{include} tpu.inc.md
+:start-after: "# Installation"
+:end-before: "## Requirements"
+```
+
+:::
+
+:::{tab-item} Intel Gaudi
+:sync: hpu-gaudi
+
+```{include} hpu-gaudi.inc.md
+:start-after: "# Installation"
+:end-before: "## Requirements"
+```
+
+:::
+
+:::{tab-item} Neuron
+:sync: neuron
+
+```{include} neuron.inc.md
+:start-after: "# Installation"
+:end-before: "## Requirements"
+```
+
+:::
+
+:::{tab-item} OpenVINO
+:sync: openvino
+
+```{include} openvino.inc.md
+:start-after: "# Installation"
+:end-before: "## Requirements"
+```
+
+:::
+
+::::
+
+## Requirements
+
+::::{tab-set}
+:sync-group: device
+
+:::{tab-item} TPU
+:sync: tpu
+
+```{include} tpu.inc.md
+:start-after: "## Requirements"
+:end-before: "## Configure a new environment"
+```
+
+:::
+
+:::{tab-item} Intel Gaudi
+:sync: hpu-gaudi
+
+```{include} hpu-gaudi.inc.md
+:start-after: "## Requirements"
+:end-before: "## Configure a new environment"
+```
+
+:::
+
+:::{tab-item} Neuron
+:sync: neuron
+
+```{include} neuron.inc.md
+:start-after: "## Requirements"
+:end-before: "## Configure a new environment"
+```
+
+:::
+
+:::{tab-item} OpenVINO
+:sync: openvino
+
+```{include} openvino.inc.md
+:start-after: "## Requirements"
+:end-before: "## Set up using Python"
+```
+
+:::
+
+::::
+
+## Configure a new environment
+
+::::{tab-set}
+:sync-group: device
+
+:::{tab-item} TPU
+:sync: tpu
+
+```{include} tpu.inc.md
+:start-after: "## Configure a new environment"
+:end-before: "## Set up using Python"
+```
+
+:::
+
+:::{tab-item} Intel Gaudi
+:sync: hpu-gaudi
+
+```{include} hpu-gaudi.inc.md
+:start-after: "## Configure a new environment"
+:end-before: "## Set up using Python"
+```
+
+:::
+
+:::{tab-item} Neuron
+:sync: neuron
+
+```{include} neuron.inc.md
+:start-after: "## Configure a new environment"
+:end-before: "## Set up using Python"
+```
+
+:::
+
+:::{tab-item} OpenVINO
+:sync: openvino
+
+```{include} ../python_env_setup.inc.md
+```
+
+:::
+
+::::
+
+## Set up using Python
+
+### Pre-built wheels
+
+::::{tab-set}
+:sync-group: device
+
+:::{tab-item} TPU
+:sync: tpu
+
+```{include} tpu.inc.md
+:start-after: "### Pre-built wheels"
+:end-before: "### Build wheel from source"
+```
+
+:::
+
+:::{tab-item} Intel Gaudi
+:sync: hpu-gaudi
+
+```{include} hpu-gaudi.inc.md
+:start-after: "### Pre-built wheels"
+:end-before: "### Build wheel from source"
+```
+
+:::
+
+:::{tab-item} Neuron
+:sync: neuron
+
+```{include} neuron.inc.md
+:start-after: "### Pre-built wheels"
+:end-before: "### Build wheel from source"
+```
+
+:::
+
+:::{tab-item} OpenVINO
+:sync: openvino
+
+```{include} openvino.inc.md
+:start-after: "### Pre-built wheels"
+:end-before: "### Build wheel from source"
+```
+
+:::
+
+::::
+
+### Build wheel from source
+
+::::{tab-set}
+:sync-group: device
+
+:::{tab-item} TPU
+:sync: tpu
+
+```{include} tpu.inc.md
+:start-after: "### Build wheel from source"
+:end-before: "## Set up using Docker"
+```
+
+:::
+
+:::{tab-item} Intel Gaudi
+:sync: hpu-gaudi
+
+```{include} hpu-gaudi.inc.md
+:start-after: "### Build wheel from source"
+:end-before: "## Set up using Docker"
+```
+
+:::
+
+:::{tab-item} Neuron
+:sync: neuron
+
+```{include} neuron.inc.md
+:start-after: "### Build wheel from source"
+:end-before: "## Set up using Docker"
+```
+
+:::
+
+:::{tab-item} OpenVINO
+:sync: openvino
+
+```{include} openvino.inc.md
+:start-after: "### Build wheel from source"
+:end-before: "## Set up using Docker"
+```
+
+:::
+
+::::
+
+## Set up using Docker
+
+### Pre-built images
+
+::::{tab-set}
+:sync-group: device
+
+:::{tab-item} TPU
+:sync: tpu
+
+```{include} tpu.inc.md
+:start-after: "### Pre-built images"
+:end-before: "### Build image from source"
+```
+
+:::
+
+:::{tab-item} Intel Gaudi
+:sync: hpu-gaudi
+
+```{include} hpu-gaudi.inc.md
+:start-after: "### Pre-built images"
+:end-before: "### Build image from source"
+```
+
+:::
+
+:::{tab-item} Neuron
+:sync: neuron
+
+```{include} neuron.inc.md
+:start-after: "### Pre-built images"
+:end-before: "### Build image from source"
+```
+
+:::
+
+:::{tab-item} OpenVINO
+:sync: openvino
+
+```{include} openvino.inc.md
+:start-after: "### Pre-built images"
+:end-before: "### Build image from source"
+```
+
+:::
+
+::::
+
+### Build image from source
+
+::::{tab-set}
+:sync-group: device
+
+:::{tab-item} TPU
+:sync: tpu
+
+```{include} tpu.inc.md
+:start-after: "### Build image from source"
+:end-before: "## Extra information"
+```
+
+:::
+
+:::{tab-item} Intel Gaudi
+:sync: hpu-gaudi
+
+```{include} hpu-gaudi.inc.md
+:start-after: "### Build image from source"
+:end-before: "## Extra information"
+```
+
+:::
+
+:::{tab-item} Neuron
+:sync: neuron
+
+```{include} neuron.inc.md
+:start-after: "### Build image from source"
+:end-before: "## Extra information"
+```
+
+:::
+
+:::{tab-item} OpenVINO
+:sync: openvino
+
+```{include} openvino.inc.md
+:start-after: "### Build image from source"
+:end-before: "## Extra information"
+```
+
+:::
+
+::::
+
+## Extra information
+
+::::{tab-set}
+:sync-group: device
+
+:::{tab-item} TPU
+:sync: tpu
+
+```{include} tpu.inc.md
+:start-after: "## Extra information"
+```
+
+:::
+
+:::{tab-item} Intel Gaudi
+:sync: hpu-gaudi
+
+```{include} hpu-gaudi.inc.md
+:start-after: "## Extra information"
+```
+
+:::
+
+:::{tab-item} Neuron
+:sync: neuron
+
+```{include} neuron.inc.md
+:start-after: "## Extra information"
+```
+
+:::
+
+:::{tab-item} OpenVINO
+:sync: openvino
+
+```{include} openvino.inc.md
+:start-after: "## Extra information"
+```
+
+:::
+
+::::
diff --git a/docs/source/getting_started/installation/neuron.md b/docs/source/getting_started/installation/ai_accelerator/neuron.inc.md
similarity index 86%
rename from docs/source/getting_started/installation/neuron.md
rename to docs/source/getting_started/installation/ai_accelerator/neuron.inc.md
index 5581b1940ca46..575a9f9c2e2f0 100644
--- a/docs/source/getting_started/installation/neuron.md
+++ b/docs/source/getting_started/installation/ai_accelerator/neuron.inc.md
@@ -1,6 +1,4 @@
-(installation-neuron)=
-
-# Installation for Neuron
+# Installation
 
 vLLM 0.3.3 onwards supports model inferencing and serving on AWS Trainium/Inferentia with Neuron SDK with continuous batching.
 Paged Attention and Chunked Prefill are currently in development and will be available soon.
@@ -14,28 +12,9 @@ Data types currently supported in Neuron SDK are FP16 and BF16.
 - Pytorch 2.0.1/2.1.1
 - AWS Neuron SDK 2.16/2.17 (Verified on python 3.8)
 
-Installation steps:
-
-- [Build from source](#build-from-source-neuron)
-
-  - [Step 0. Launch Trn1/Inf2 instances](#launch-instances)
-  - [Step 1. Install drivers and tools](#install-drivers)
-  - [Step 2. Install transformers-neuronx and its dependencies](#install-tnx)
-  - [Step 3. Install vLLM from source](#install-vllm)
-
-(build-from-source-neuron)=
-
-```{note}
-The currently supported version of Pytorch for Neuron installs `triton` version `2.1.0`. This is incompatible with `vllm >= 0.5.3`. You may see an error `cannot import name 'default_dump_dir...`. To work around this, run a `pip install --upgrade triton==3.0.0` after installing the vLLM wheel.
-```
-
-## Build from source
-
-Following instructions are applicable to Neuron SDK 2.16 and beyond.
-
-(launch-instances)=
+## Configure a new environment
 
-### Step 0. Launch Trn1/Inf2 instances
+### Launch Trn1/Inf2 instances
 
 Here are the steps to launch trn1/inf2 instances, in order to install [PyTorch Neuron ("torch-neuronx") Setup on Ubuntu 22.04 LTS](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/setup/neuron-setup/pytorch/neuronx/ubuntu/torch-neuronx-ubuntu22.html).
 
@@ -45,9 +24,7 @@ Here are the steps to launch trn1/inf2 instances, in order to install [PyTorch N
 - When launching a Trn1/Inf2, please adjust your primary EBS volume size to a minimum of 512GB.
 - After launching the instance, follow the instructions in [Connect to your instance](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/AccessingInstancesLinux.html) to connect to the instance
 
-(install-drivers)=
-
-### Step 1. Install drivers and tools
+### Install drivers and tools
 
 The installation of drivers and tools wouldn't be necessary, if [Deep Learning AMI Neuron](https://docs.aws.amazon.com/dlami/latest/devguide/appendix-ami-release-notes.html) is installed. In case the drivers and tools are not installed on the operating system, follow the steps below:
 
@@ -82,9 +59,21 @@ sudo apt-get install aws-neuronx-tools=2.* -y
 export PATH=/opt/aws/neuron/bin:$PATH
 ```
 
-(install-tnx)=
+## Set up using Python
+
+### Pre-built wheels
 
-### Step 2. Install transformers-neuronx and its dependencies
+Currently, there are no pre-built Neuron wheels.
+
+### Build wheel from source
+
+```{note}
+The currently supported version of PyTorch for Neuron installs `triton` version `2.1.0`. This is incompatible with `vllm >= 0.5.3`. You may see an error `cannot import name 'default_dump_dir...`. To work around this, run `pip install --upgrade triton==3.0.0` after installing the vLLM wheel.
+```
+
+The following instructions are applicable to Neuron SDK 2.16 and beyond.
+
+#### Install transformers-neuronx and its dependencies
 
 [transformers-neuronx](https://github.com/aws-neuron/transformers-neuronx) will be the backend to support inference on trn1/inf2 instances.
 Follow the steps below to install transformer-neuronx package and its dependencies.
@@ -116,9 +105,7 @@ python -m pip install awscli
 python -m pip install --upgrade neuronx-cc==2.* --pre torch-neuronx==2.1.* torchvision transformers-neuronx
 ```
 
-(install-vllm)=
-
-### Step 3. Install vLLM from source
+#### Install vLLM from source
 
 Once neuronx-cc and transformers-neuronx packages are installed, we will be able to install vllm as follows:
 
@@ -130,3 +117,19 @@ VLLM_TARGET_DEVICE="neuron" pip install .
 ```
 
 If neuron packages are detected correctly in the installation process, `vllm-0.3.0+neuron212` will be installed.
+
+## Set up using Docker
+
+### Pre-built images
+
+Currently, there are no pre-built Neuron images.
+
+### Build image from source
+
+See <project:#deployment-docker-build-image-from-source> for instructions on building the Docker image.
+
+Make sure to use <gh-file:Dockerfile.neuron> in place of the default Dockerfile.
+
+## Extra information
+
+There is no extra information for this device.
diff --git a/docs/source/getting_started/installation/openvino.md b/docs/source/getting_started/installation/ai_accelerator/openvino.inc.md
similarity index 69%
rename from docs/source/getting_started/installation/openvino.md
rename to docs/source/getting_started/installation/ai_accelerator/openvino.inc.md
index d97d4173bf36b..a7867472583d6 100644
--- a/docs/source/getting_started/installation/openvino.md
+++ b/docs/source/getting_started/installation/ai_accelerator/openvino.inc.md
@@ -1,63 +1,65 @@
-(installation-openvino)=
+# Installation
 
-# Installation for OpenVINO
+vLLM powered by OpenVINO supports all LLM models from [vLLM supported models list](#supported-models) and can perform optimal model serving on all x86-64 CPUs with, at least, AVX2 support, as well as on both integrated and discrete Intel® GPUs ([the list of supported GPUs](https://docs.openvino.ai/2024/about-openvino/release-notes-openvino/system-requirements.html#gpu)).
 
-vLLM powered by OpenVINO supports all LLM models from [vLLM supported models list](#supported-models) and can perform optimal model serving on all x86-64 CPUs with, at least, AVX2 support, as well as on both integrated and discrete Intel® GPUs ([the list of supported GPUs](https://docs.openvino.ai/2024/about-openvino/release-notes-openvino/system-requirements.html#gpu)). OpenVINO vLLM backend supports the following advanced vLLM features:
+## Requirements
 
-- Prefix caching (`--enable-prefix-caching`)
-- Chunked prefill (`--enable-chunked-prefill`)
+- OS: Linux
+- Instruction set architecture (ISA) requirement: at least AVX2.
 
-**Table of contents**:
+## Set up using Python
 
-- [Requirements](#openvino-backend-requirements)
-- [Quick start using Dockerfile](#openvino-backend-quick-start-dockerfile)
-- [Build from source](#install-openvino-backend-from-source)
-- [Performance tips](#openvino-backend-performance-tips)
-- [Limitations](#openvino-backend-limitations)
+### Pre-built wheels
 
-(openvino-backend-requirements)=
+Currently, there are no pre-built OpenVINO wheels.
 
-## Requirements
+### Build wheel from source
 
-- OS: Linux
-- Instruction set architecture (ISA) requirement: at least AVX2.
+First, install Python. For example, on Ubuntu 22.04, you can run:
 
-(openvino-backend-quick-start-dockerfile)=
+```console
+sudo apt-get update  -y
+sudo apt-get install python3
+```
 
-## Quick start using Dockerfile
+Second, install the prerequisites for the vLLM OpenVINO backend installation:
 
 ```console
-docker build -f Dockerfile.openvino -t vllm-openvino-env .
-docker run -it --rm vllm-openvino-env
+pip install --upgrade pip
+pip install -r requirements-build.txt --extra-index-url https://download.pytorch.org/whl/cpu
 ```
 
-(install-openvino-backend-from-source)=
+Finally, install vLLM with OpenVINO backend:
 
-## Install from source
+```console
+PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE=openvino python -m pip install -v .
+```
 
-- First, install Python. For example, on Ubuntu 22.04, you can run:
+:::{tip}
+To use vLLM OpenVINO backend with a GPU device, ensure your system is properly set up. Follow the instructions provided here: [https://docs.openvino.ai/2024/get-started/configurations/configurations-intel-gpu.html](https://docs.openvino.ai/2024/get-started/configurations/configurations-intel-gpu.html).
+:::
 
-  ```console
-  sudo apt-get update  -y
-  sudo apt-get install python3
-  ```
+## Set up using Docker
 
-- Second, install prerequisites vLLM OpenVINO backend installation:
+### Pre-built images
 
-  ```console
-  pip install --upgrade pip
-  pip install -r requirements-build.txt --extra-index-url https://download.pytorch.org/whl/cpu
-  ```
+Currently, there are no pre-built OpenVINO images.
 
-- Finally, install vLLM with OpenVINO backend:
+### Build image from source
 
-  ```console
-  PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE=openvino python -m pip install -v .
-  ```
+```console
+docker build -f Dockerfile.openvino -t vllm-openvino-env .
+docker run -it --rm vllm-openvino-env
+```
 
-- [Optional] To use vLLM OpenVINO backend with a GPU device, ensure your system is properly set up. Follow the instructions provided here: [https://docs.openvino.ai/2024/get-started/configurations/configurations-intel-gpu.html](https://docs.openvino.ai/2024/get-started/configurations/configurations-intel-gpu.html).
+## Extra information
 
-(openvino-backend-performance-tips)=
+## Supported features
+
+OpenVINO vLLM backend supports the following advanced vLLM features:
+
+- Prefix caching (`--enable-prefix-caching`)
+- Chunked prefill (`--enable-chunked-prefill`)
 
 ## Performance tips
 
@@ -95,8 +97,6 @@ $ VLLM_OPENVINO_DEVICE=GPU VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON \
     python3 vllm/benchmarks/benchmark_throughput.py --model meta-llama/Llama-2-7b-chat-hf --dataset vllm/benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json
 ```
 
-(openvino-backend-limitations)=
-
 ## Limitations
 
 - LoRA serving is not supported.
diff --git a/docs/source/getting_started/installation/tpu.md b/docs/source/getting_started/installation/ai_accelerator/tpu.inc.md
similarity index 88%
rename from docs/source/getting_started/installation/tpu.md
rename to docs/source/getting_started/installation/ai_accelerator/tpu.inc.md
index 1938785ade46a..6a911cc6b9eba 100644
--- a/docs/source/getting_started/installation/tpu.md
+++ b/docs/source/getting_started/installation/ai_accelerator/tpu.inc.md
@@ -1,6 +1,4 @@
-(installation-tpu)=
-
-# Installation for TPUs
+# Installation
 
 Tensor Processing Units (TPUs) are Google's custom-developed application-specific
 integrated circuits (ASICs) used to accelerate machine learning workloads. TPUs
@@ -54,7 +52,16 @@ In all of the following commands, replace the ALL CAPS parameter names with
 appropriate values. See the parameter descriptions table for more information.
 ```
 
-## Provision a Cloud TPU with the queued resource API
+### Provision Cloud TPUs with GKE
+
+For more information about using TPUs with GKE, see:
+- <https://cloud.google.com/kubernetes-engine/docs/how-to/tpus>
+- <https://cloud.google.com/kubernetes-engine/docs/concepts/tpus>
+- <https://cloud.google.com/kubernetes-engine/docs/concepts/plan-tpus>
+
+## Configure a new environment
+
+### Provision a Cloud TPU with the queued resource API
 
 Create a TPU v5e with 4 TPU chips:
 
@@ -102,6 +109,14 @@ Connect to your TPU using SSH:
 gcloud compute tpus tpu-vm ssh TPU_NAME --zone ZONE
 ```
 
+## Set up using Python
+
+### Pre-built wheels
+
+Currently, there are no pre-built TPU wheels.
+
+### Build wheel from source
+
 Install Miniconda:
 
 ```bash
@@ -142,16 +157,13 @@ Run the setup script:
 VLLM_TARGET_DEVICE="tpu" python setup.py develop
 ```
 
-## Provision Cloud TPUs with GKE
+## Set up using Docker
 
-For more information about using TPUs with GKE, see
-<https://cloud.google.com/kubernetes-engine/docs/how-to/tpus>
-<https://cloud.google.com/kubernetes-engine/docs/concepts/tpus>
-<https://cloud.google.com/kubernetes-engine/docs/concepts/plan-tpus>
+### Pre-built images
 
-(build-docker-tpu)=
+See <project:#deployment-docker-pre-built-image> for instructions on using the official Docker image, making sure to substitute the image name `vllm/vllm-openai` with `vllm/vllm-tpu`.
 
-## Build a docker image with {code}`Dockerfile.tpu`
+### Build image from source
 
 You can use <gh-file:Dockerfile.tpu> to build a Docker image with TPU support.
 
@@ -189,3 +201,7 @@ Install OpenBLAS with the following command:
 $ sudo apt-get install libopenblas-base libopenmpi-dev libomp-dev
 ```
 ````
+
+## Extra information
+
+There is no extra information for this device.
diff --git a/docs/source/getting_started/installation/cpu-arm.md b/docs/source/getting_started/installation/cpu-arm.md
deleted file mode 100644
index e199073ed721f..0000000000000
--- a/docs/source/getting_started/installation/cpu-arm.md
+++ /dev/null
@@ -1,46 +0,0 @@
-(installation-arm)=
-
-# Installation for ARM CPUs
-
-vLLM has been adapted to work on ARM64 CPUs with NEON support, leveraging the CPU backend initially developed for the x86 platform. This guide provides installation instructions specific to ARM (which also apply to Apple Silicon, see [Installation for macOS](#installation-apple) for more). For additional details on supported features, refer to the [x86 CPU documentation](#installation-x86) covering:
-
-- CPU backend inference capabilities
-- Relevant runtime environment variables
-- Performance optimization tips
-
-ARM CPU backend currently supports Float32, FP16 and BFloat16 datatypes.
-Contents:
-
-1. [Requirements](#arm-backend-requirements)
-2. [Quick Start with Dockerfile](#arm-backend-quick-start-dockerfile)
-3. [Building from Source](#build-arm-backend-from-source)
-
-(arm-backend-requirements)=
-
-## Requirements
-
-- **Operating System**: Linux or macOS
-- **Compilers**: `gcc/g++ >= 12.3.0` (optional, but recommended) or `Apple Clang >= 15.0.0` for macOS
-- **Instruction Set Architecture (ISA)**: NEON support is required
-
-(arm-backend-quick-start-dockerfile)=
-
-## Quick Start with Dockerfile
-
-You can quickly set up vLLM on ARM using Docker:
-
-```console
-$ docker build -f Dockerfile.arm -t vllm-cpu-env --shm-size=4g .
-$ docker run -it \
-             --rm \
-             --network=host \
-             --cpuset-cpus=<cpu-id-list, optional> \
-             --cpuset-mems=<memory-node, optional> \
-             vllm-cpu-env
-```
-
-(build-arm-backend-from-source)=
-
-## Building from Source
-
-To build vLLM from source on Ubuntu 22.04 or other Linux distributions, follow a similar process as with x86. Testing has been conducted on AWS Graviton3 instances for compatibility.
diff --git a/docs/source/getting_started/installation/cpu-apple.md b/docs/source/getting_started/installation/cpu/apple.inc.md
similarity index 70%
rename from docs/source/getting_started/installation/cpu-apple.md
rename to docs/source/getting_started/installation/cpu/apple.inc.md
index 1068893f5bafa..56545253b1ef7 100644
--- a/docs/source/getting_started/installation/cpu-apple.md
+++ b/docs/source/getting_started/installation/cpu/apple.inc.md
@@ -1,20 +1,20 @@
-(installation-apple)=
+# Installation
 
-# Installation for macOS
-
-vLLM has experimental support for macOS with Apple Silicon. For now, users shall build from the source vLLM to natively run on macOS. For more details, like running on vLLM in a docker container, see [ARM CPU Documentation](installation-arm)
+vLLM has experimental support for macOS with Apple silicon. For now, users must build vLLM from source to run it natively on macOS.
 
 Currently the CPU implementation for macOS supports FP32 and FP16 datatypes.
 
 ## Requirements
 
-- **Operating System**: `macOS Sonoma` or later
-- **SDK** `XCode 15.4` or later with Command Line Tools
-- **Compilers**: `Apple Clang >= 15.0.0`
+- OS: `macOS Sonoma` or later
+- SDK: `XCode 15.4` or later with Command Line Tools
+- Compiler: `Apple Clang >= 15.0.0`
+
+## Set up using Python
 
-<!-- (arm-backend-quick-start-dockerfile)= -->
+### Pre-built wheels
 
-## Build and installation
+### Build wheel from source
 
 After installation of XCode and the Command Line Tools, which include Apple Clang, execute the following commands to build and install vLLM from the source.
 
@@ -29,7 +29,7 @@ pip install -e .
 On macOS the `VLLM_TARGET_DEVICE` is automatically set to `cpu`, which currently is the only supported device.
 ```
 
-## Troubleshooting
+#### Troubleshooting
 
 If the build has error like the following snippet where standard C++ headers cannot be found, try to remove and reinstall your
 [Command Line Tools for Xcode](https://developer.apple.com/download/all/).
@@ -46,3 +46,11 @@ If the build has error like the following snippet where standard C++ headers can
             |          ^~~~~~~~~
       1 error generated.
 ```
+
+## Set up using Docker
+
+### Pre-built images
+
+### Build image from source
+
+## Extra information
diff --git a/docs/source/getting_started/installation/cpu/arm.inc.md b/docs/source/getting_started/installation/cpu/arm.inc.md
new file mode 100644
index 0000000000000..08a764e1a25f4
--- /dev/null
+++ b/docs/source/getting_started/installation/cpu/arm.inc.md
@@ -0,0 +1,30 @@
+# Installation
+
+vLLM has been adapted to work on ARM64 CPUs with NEON support, leveraging the CPU backend initially developed for the x86 platform.
+
+ARM CPU backend currently supports Float32, FP16 and BFloat16 datatypes.
+
+## Requirements
+
+- OS: Linux
+- Compiler: `gcc/g++ >= 12.3.0` (optional, recommended)
+- Instruction Set Architecture (ISA): NEON support is required
+
+## Set up using Python
+
+### Pre-built wheels
+
+### Build wheel from source
+
+:::{include} build.inc.md
+:::
+
+Testing has been conducted on AWS Graviton3 instances for compatibility.
+
+## Set up using Docker
+
+### Pre-built images
+
+### Build image from source
+
+## Extra information
diff --git a/docs/source/getting_started/installation/cpu/build.inc.md b/docs/source/getting_started/installation/cpu/build.inc.md
new file mode 100644
index 0000000000000..f8d1044a0d198
--- /dev/null
+++ b/docs/source/getting_started/installation/cpu/build.inc.md
@@ -0,0 +1,21 @@
+First, install the recommended compiler. We recommend using `gcc/g++ >= 12.3.0` as the default compiler to avoid potential problems. For example, on Ubuntu 22.04, you can run:
+
+```console
+sudo apt-get update  -y
+sudo apt-get install -y gcc-12 g++-12 libnuma-dev
+sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
+```
+
+Second, install Python packages for vLLM CPU backend building:
+
+```console
+pip install --upgrade pip
+pip install "cmake>=3.26" wheel packaging ninja "setuptools-scm>=8" numpy
+pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu
+```
+
+Finally, build and install vLLM CPU backend:
+
+```console
+VLLM_TARGET_DEVICE=cpu python setup.py install
+```
diff --git a/docs/source/getting_started/installation/cpu-x86.md b/docs/source/getting_started/installation/cpu/index.md
similarity index 67%
rename from docs/source/getting_started/installation/cpu-x86.md
rename to docs/source/getting_started/installation/cpu/index.md
index c49c8e0f2a18c..4ec907c0e9fda 100644
--- a/docs/source/getting_started/installation/cpu-x86.md
+++ b/docs/source/getting_started/installation/cpu/index.md
@@ -1,91 +1,165 @@
-(installation-x86)=
+# CPU
 
-# Installation for x86 CPUs
+vLLM is a Python library that supports the following CPU variants. Select your CPU type to see vendor-specific instructions:
 
-vLLM initially supports basic model inferencing and serving on x86 CPU platform, with data types FP32, FP16 and BF16. vLLM CPU backend supports the following vLLM features:
+::::{tab-set}
+:sync-group: device
 
-- Tensor Parallel
-- Model Quantization (`INT8 W8A8, AWQ, GPTQ`)
-- Chunked-prefill
-- Prefix-caching
-- FP8-E5M2 KV-Caching (TODO)
+:::{tab-item} x86
+:sync: x86
+
+```{include} x86.inc.md
+:start-after: "# Installation"
+:end-before: "## Requirements"
+```
 
-Table of contents:
+:::
+
+:::{tab-item} ARM
+:sync: arm
+
+```{include} arm.inc.md
+:start-after: "# Installation"
+:end-before: "## Requirements"
+```
+
+:::
+
+:::{tab-item} Apple silicon
+:sync: apple
+
+```{include} apple.inc.md
+:start-after: "# Installation"
+:end-before: "## Requirements"
+```
 
-1. [Requirements](#cpu-backend-requirements)
-2. [Quick start using Dockerfile](#cpu-backend-quick-start-dockerfile)
-3. [Build from source](#build-cpu-backend-from-source)
-4. [Related runtime environment variables](#env-intro)
-5. [Intel Extension for PyTorch](#ipex-guidance)
-6. [Performance tips](#cpu-backend-performance-tips)
+:::
 
-(cpu-backend-requirements)=
+::::
 
 ## Requirements
 
-- OS: Linux
-- Compiler: `gcc/g++>=12.3.0` (optional, recommended)
-- Instruction set architecture (ISA) requirement: AVX512 (optional, recommended)
+- Python: 3.9 -- 3.12
 
-(cpu-backend-quick-start-dockerfile)=
+::::{tab-set}
+:sync-group: device
 
-## Quick start using Dockerfile
+:::{tab-item} x86
+:sync: x86
 
-```console
-docker build -f Dockerfile.cpu -t vllm-cpu-env --shm-size=4g .
-docker run -it \
-           --rm \
-           --network=host \
-           --cpuset-cpus=<cpu-id-list, optional> \
-           --cpuset-mems=<memory-node, optional> \
-           vllm-cpu-env
+```{include} x86.inc.md
+:start-after: "## Requirements"
+:end-before: "## Set up using Python"
 ```
 
-(build-cpu-backend-from-source)=
+:::
+
+:::{tab-item} ARM
+:sync: arm
 
-## Build from source
+```{include} arm.inc.md
+:start-after: "## Requirements"
+:end-before: "## Set up using Python"
+```
 
-- First, install recommended compiler. We recommend to use `gcc/g++ >= 12.3.0` as the default compiler to avoid potential problems. For example, on Ubuntu 22.4, you can run:
+:::
 
-```console
-sudo apt-get update  -y
-sudo apt-get install -y gcc-12 g++-12 libnuma-dev
-sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
+:::{tab-item} Apple silicon
+:sync: apple
+
+```{include} apple.inc.md
+:start-after: "## Requirements"
+:end-before: "## Set up using Python"
 ```
 
-- Second, install Python packages for vLLM CPU backend building:
+:::
 
-```console
-pip install --upgrade pip
-pip install cmake>=3.26 wheel packaging ninja "setuptools-scm>=8" numpy
-pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu
+::::
+
+## Set up using Python
+
+### Create a new Python environment
+
+```{include} ../python_env_setup.inc.md
 ```
 
-- Finally, build and install vLLM CPU backend:
+### Pre-built wheels
 
-```console
-VLLM_TARGET_DEVICE=cpu python setup.py install
+Currently, there are no pre-built CPU wheels.
+
+### Build wheel from source
+
+::::{tab-set}
+:sync-group: device
+
+:::{tab-item} x86
+:sync: x86
+
+```{include} x86.inc.md
+:start-after: "### Build wheel from source"
+:end-before: "## Set up using Docker"
 ```
 
-```{note}
-- AVX512_BF16 is an extension ISA provides native BF16 data type conversion and vector product instructions, will brings some performance improvement compared with pure AVX512. The CPU backend build script will check the host CPU flags to determine whether to enable AVX512_BF16.
-- If you want to force enable AVX512_BF16 for the cross-compilation, please set environment variable `VLLM_CPU_AVX512BF16=1` before the building.
+:::
+
+:::{tab-item} ARM
+:sync: arm
+
+```{include} arm.inc.md
+:start-after: "### Build wheel from source"
+:end-before: "## Set up using Docker"
 ```
 
-(env-intro)=
+:::
 
-## Related runtime environment variables
+:::{tab-item} Apple silicon
+:sync: apple
 
-- `VLLM_CPU_KVCACHE_SPACE`: specify the KV Cache size (e.g, `VLLM_CPU_KVCACHE_SPACE=40` means 40 GB space for KV cache), larger setting will allow vLLM running more requests in parallel. This parameter should be set based on the hardware configuration and memory management pattern of users.
-- `VLLM_CPU_OMP_THREADS_BIND`: specify the CPU cores dedicated to the OpenMP threads. For example, `VLLM_CPU_OMP_THREADS_BIND=0-31` means there will be 32 OpenMP threads bound on 0-31 CPU cores. `VLLM_CPU_OMP_THREADS_BIND=0-31|32-63` means there will be 2 tensor parallel processes, 32 OpenMP threads of rank0 are bound on 0-31 CPU cores, and the OpenMP threads of rank1 are bound on 32-63 CPU cores.
+```{include} apple.inc.md
+:start-after: "### Build wheel from source"
+:end-before: "## Set up using Docker"
+```
+
+:::
+
+::::
+
+## Set up using Docker
+
+### Pre-built images
+
+Currently, there are no pre-built CPU images.
+
+### Build image from source
 
-(ipex-guidance)=
+```console
+$ docker build -f Dockerfile.cpu -t vllm-cpu-env --shm-size=4g .
+$ docker run -it \
+             --rm \
+             --network=host \
+             --cpuset-cpus=<cpu-id-list, optional> \
+             --cpuset-mems=<memory-node, optional> \
+             vllm-cpu-env
+```
 
-## Intel Extension for PyTorch
+:::{tip}
+For ARM or Apple silicon, use `Dockerfile.arm`
+:::
 
-- [Intel Extension for PyTorch (IPEX)](https://github.com/intel/intel-extension-for-pytorch) extends PyTorch with up-to-date features optimizations for an extra performance boost on Intel hardware.
+## Supported features
 
-(cpu-backend-performance-tips)=
+vLLM CPU backend supports the following vLLM features:
+
+- Tensor Parallel
+- Model Quantization (`INT8 W8A8, AWQ, GPTQ`)
+- Chunked-prefill
+- Prefix-caching
+- FP8-E5M2 KV-Caching (TODO)
+
+## Related runtime environment variables
+
+- `VLLM_CPU_KVCACHE_SPACE`: specify the KV Cache size (e.g., `VLLM_CPU_KVCACHE_SPACE=40` means 40 GB space for KV cache). A larger setting allows vLLM to run more requests in parallel. This parameter should be set based on the hardware configuration and memory management pattern of users.
+- `VLLM_CPU_OMP_THREADS_BIND`: specify the CPU cores dedicated to the OpenMP threads. For example, `VLLM_CPU_OMP_THREADS_BIND=0-31` means there will be 32 OpenMP threads bound on 0-31 CPU cores. `VLLM_CPU_OMP_THREADS_BIND=0-31|32-63` means there will be 2 tensor parallel processes, 32 OpenMP threads of rank0 are bound on 0-31 CPU cores, and the OpenMP threads of rank1 are bound on 32-63 CPU cores.
 
 ## Performance tips
 
@@ -137,13 +211,13 @@ $ python examples/offline_inference/basic.py
 
 - If using vLLM CPU backend on a multi-socket machine with NUMA, be aware to set CPU cores using `VLLM_CPU_OMP_THREADS_BIND` to avoid cross NUMA node memory access.
 
-## CPU Backend Considerations
+## Other considerations
 
 - The CPU backend significantly differs from the GPU backend since the vLLM architecture was originally optimized for GPU use. A number of optimizations are needed to enhance its performance.
 
 - Decouple the HTTP serving components from the inference components. In a GPU backend configuration, the HTTP serving and tokenization tasks operate on the CPU, while inference runs on the GPU, which typically does not pose a problem. However, in a CPU-based setup, the HTTP serving and tokenization can cause significant context switching and reduced cache efficiency. Therefore, it is strongly recommended to segregate these two components for improved performance.
 
-- On CPU based setup with NUMA enabled, the memory access performance may be largely impacted by the [topology](https://github.com/intel/intel-extension-for-pytorch/blob/main/docs/tutorials/performance_tuning/tuning_guide.md#non-uniform-memory-access-numa). For NUMA architecture, two optimizations are to recommended: Tensor Parallel or Data Parallel.
+- On CPU based setup with NUMA enabled, the memory access performance may be largely impacted by the [topology](https://github.com/intel/intel-extension-for-pytorch/blob/main/docs/tutorials/performance_tuning/tuning_guide.md#non-uniform-memory-access-numa). For NUMA architecture, two optimizations are recommended: Tensor Parallel or Data Parallel.
 
   - Using Tensor Parallel for a latency constraints deployment: following GPU backend design, a Megatron-LM's parallel algorithm will be used to shard the model, based on the number of NUMA nodes (e.g. TP = 2 for a two NUMA node system). With [TP feature on CPU](gh-pr:6125) merged, Tensor Parallel is supported for serving and offline inferencing. In general each NUMA node is treated as one GPU card. Below is the example script to enable Tensor Parallel = 2 for serving:
 
@@ -151,4 +225,4 @@ $ python examples/offline_inference/basic.py
     VLLM_CPU_KVCACHE_SPACE=40 VLLM_CPU_OMP_THREADS_BIND="0-31|32-63" vllm serve meta-llama/Llama-2-7b-chat-hf -tp=2 --distributed-executor-backend mp
     ```
 
-  - Using Data Parallel for maximum throughput: to launch an LLM serving endpoint on each NUMA node along with one additional load balancer to dispatch the requests to those endpoints. Common solutions like [Nginx](#nginxloadbalancer) or HAProxy are recommended. Anyscale Ray project provides the feature on LLM [serving](https://docs.ray.io/en/latest/serve/index.html). Here is the example to setup a scalable LLM serving with [Ray Serve](https://github.com/intel/llm-on-ray/blob/main/docs/setup.md).
+  - Using Data Parallel for maximum throughput: to launch an LLM serving endpoint on each NUMA node along with one additional load balancer to dispatch the requests to those endpoints. Common solutions like [Nginx](#nginxloadbalancer) or HAProxy are recommended. Anyscale Ray project provides the feature on LLM [serving](https://docs.ray.io/en/latest/serve/index.html). Here is the example to setup a scalable LLM serving with [Ray Serve](https://github.com/intel/llm-on-ray/blob/main/docs/setup.md).
diff --git a/docs/source/getting_started/installation/cpu/x86.inc.md b/docs/source/getting_started/installation/cpu/x86.inc.md
new file mode 100644
index 0000000000000..e4f99d3cebdf2
--- /dev/null
+++ b/docs/source/getting_started/installation/cpu/x86.inc.md
@@ -0,0 +1,35 @@
+# Installation
+
+vLLM initially supports basic model inferencing and serving on x86 CPU platform, with data types FP32, FP16 and BF16.
+
+## Requirements
+
+- OS: Linux
+- Compiler: `gcc/g++ >= 12.3.0` (optional, recommended)
+- Instruction Set Architecture (ISA): AVX512 (optional, recommended)
+
+## Set up using Python
+
+### Pre-built wheels
+
+### Build wheel from source
+
+:::{include} build.inc.md
+:::
+
+```{note}
+- AVX512_BF16 is an ISA extension that provides native BF16 data type conversion and vector product instructions, which brings some performance improvement compared with pure AVX512. The CPU backend build script will check the host CPU flags to determine whether to enable AVX512_BF16.
+- If you want to force enable AVX512_BF16 for the cross-compilation, please set environment variable `VLLM_CPU_AVX512BF16=1` before the building.
+```
+
+## Set up using Docker
+
+### Pre-built images
+
+### Build image from source
+
+## Extra information
+
+## Intel Extension for PyTorch
+
+- [Intel Extension for PyTorch (IPEX)](https://github.com/intel/intel-extension-for-pytorch) extends PyTorch with up-to-date feature optimizations for an extra performance boost on Intel hardware.
diff --git a/docs/source/getting_started/installation/device.template.md b/docs/source/getting_started/installation/device.template.md
new file mode 100644
index 0000000000000..44f538da93659
--- /dev/null
+++ b/docs/source/getting_started/installation/device.template.md
@@ -0,0 +1,17 @@
+# Installation
+
+## Requirements
+
+## Set up using Python
+
+### Pre-built wheels
+
+### Build wheel from source
+
+## Set up using Docker
+
+### Pre-built images
+
+### Build image from source
+
+## Extra information
diff --git a/docs/source/getting_started/installation/gpu-cuda.md b/docs/source/getting_started/installation/gpu/cuda.inc.md
similarity index 84%
rename from docs/source/getting_started/installation/gpu-cuda.md
rename to docs/source/getting_started/installation/gpu/cuda.inc.md
index 727486abbd10f..4cce65278c069 100644
--- a/docs/source/getting_started/installation/gpu-cuda.md
+++ b/docs/source/getting_started/installation/gpu/cuda.inc.md
@@ -1,44 +1,24 @@
-(installation-cuda)=
+# Installation
 
-# Installation for CUDA
-
-vLLM is a Python library that also contains pre-compiled C++ and CUDA (12.1) binaries.
+vLLM contains pre-compiled C++ and CUDA (12.1) binaries.
 
 ## Requirements
 
-- OS: Linux
-- Python: 3.9 -- 3.12
 - GPU: compute capability 7.0 or higher (e.g., V100, T4, RTX20xx, A100, L4, H100, etc.)
 
-## Install released versions
+## Set up using Python
 
 ### Create a new Python environment
 
-You can create a new Python environment using `conda`:
-
-```console
-# (Recommended) Create a new conda environment.
-conda create -n myenv python=3.12 -y
-conda activate myenv
-```
-
 ```{note}
-[PyTorch has deprecated the conda release channel](https://github.com/pytorch/pytorch/issues/138506). If you use `conda`, please only use it to create Python environment rather than installing packages. In particular, the PyTorch installed via `conda` will statically link `NCCL` library, which can cause issues when vLLM tries to use `NCCL`. See <gh-issue:8420> for more details.
-```
-
-Or you can create a new Python environment using [uv](https://docs.astral.sh/uv/), a very fast Python environment manager. Please follow the [documentation](https://docs.astral.sh/uv/#getting-started) to install `uv`. After installing `uv`, you can create a new Python environment using the following command:
-
-```console
-# (Recommended) Create a new uv environment. Use `--seed` to install `pip` and `setuptools` in the environment.
-uv venv myenv --python 3.12 --seed
-source myenv/bin/activate
+PyTorch installed via `conda` will statically link `NCCL` library, which can cause issues when vLLM tries to use `NCCL`. See <gh-issue:8420> for more details.
 ```
 
 In order to be performant, vLLM has to compile many cuda kernels. The compilation unfortunately introduces binary incompatibility with other CUDA versions and PyTorch versions, even for the same PyTorch version with different building configurations.
 
 Therefore, it is recommended to install vLLM with a **fresh new** environment. If either you have a different CUDA version or you want to use an existing PyTorch installation, you need to build vLLM from source. See [below](#build-from-source) for more details.
 
-### Install vLLM
+### Pre-built wheels
 
 You can install vLLM using either `pip` or `uv pip`:
 
@@ -59,11 +39,11 @@ pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSI
 
 (install-the-latest-code)=
 
-## Install the latest code
+#### Install the latest code
 
 LLM inference is a fast-evolving field, and the latest code may contain bug fixes, performance improvements, and new features that are not released yet. To allow users to try the latest code without waiting for the next release, vLLM provides wheels for Linux running on a x86 platform with CUDA 12 for every commit since `v0.5.3`.
 
-### Install the latest code using `pip`
+##### Install the latest code using `pip`
 
 ```console
 pip install vllm --pre --extra-index-url https://wheels.vllm.ai/nightly
@@ -80,7 +60,7 @@ pip install https://wheels.vllm.ai/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manyl
 
 Note that the wheels are built with Python 3.8 ABI (see [PEP 425](https://peps.python.org/pep-0425/) for more details about ABI), so **they are compatible with Python 3.8 and later**. The version string in the wheel file name (`1.0.0.dev`) is just a placeholder to have a unified URL for the wheels, the actual versions of wheels are contained in the wheel metadata (the wheels listed in the extra index url have correct versions). Although we don't support Python 3.8 any more (because PyTorch 2.5 dropped support for Python 3.8), the wheels are still built with Python 3.8 ABI to keep the same wheel name as before.
 
-### Install the latest code using `uv`
+##### Install the latest code using `uv`
 
 Another way to install the latest code is to use `uv`:
 
@@ -97,26 +77,9 @@ uv pip install vllm --extra-index-url https://wheels.vllm.ai/${VLLM_COMMIT}
 
 The `uv` approach works for vLLM `v0.6.6` and later and offers an easy-to-remember command. A unique feature of `uv` is that packages in `--extra-index-url` have [higher priority than the default index](https://docs.astral.sh/uv/pip/compatibility/#packages-that-exist-on-multiple-indexes). If the latest public release is `v0.6.6.post1`, `uv`'s behavior allows installing a commit before `v0.6.6.post1` by specifying the `--extra-index-url`. In contrast, `pip` combines packages from `--extra-index-url` and the default index, choosing only the latest version, which makes it difficult to install a development version prior to the released version.
 
-### Install the latest code using `docker`
-
-Another way to access the latest code is to use the docker images:
-
-```console
-export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch
-docker pull public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:${VLLM_COMMIT}
-```
-
-These docker images are used for CI and testing only, and they are not intended for production use. They will be expired after several days.
-
-The latest code can contain bugs and may not be stable. Please use it with caution.
-
-(build-from-source)=
+### Build wheel from source
 
-## Build from source
-
-(python-only-build)=
-
-### Python-only build (without compilation)
+#### Set up using Python-only build (without compilation)
 
 If you only need to change Python code, you can build and install vLLM without compilation. Using `pip`'s [`--editable` flag](https://pip.pypa.io/en/stable/topics/local-project-installs/#editable-installs), changes you make to the code will be reflected when you run vLLM:
 
@@ -135,14 +98,14 @@ export VLLM_PRECOMPILED_WHEEL_LOCATION=https://files.pythonhosted.org/packages/4
 pip install --editable .
 ```
 
-You can find more information about vLLM's wheels [above](#install-the-latest-code).
+You can find more information about vLLM's wheels in <project:#install-the-latest-code>.
 
 ```{note}
 There is a possibility that your source code may have a different commit ID compared to the latest vLLM wheel, which could potentially lead to unknown errors.
-It is recommended to use the same commit ID for the source code as the vLLM wheel you have installed. Please refer to [the section above](#install-the-latest-code) for instructions on how to install a specified wheel.
+It is recommended to use the same commit ID for the source code as the vLLM wheel you have installed. Please refer to <project:#install-the-latest-code> for instructions on how to install a specified wheel.
 ```
 
-### Full build (with compilation)
+#### Full build (with compilation)
 
 If you want to modify C++ or CUDA code, you'll need to build vLLM from source. This can take several minutes:
 
@@ -162,7 +125,7 @@ As long as `which ccache` command can find the `ccache` binary, it will be used
 The following environment variables can be set to configure the vLLM `sccache` remote: `SCCACHE_BUCKET=vllm-build-sccache SCCACHE_REGION=us-west-2 SCCACHE_S3_NO_CREDENTIALS=1`. We also recommend setting `SCCACHE_IDLE_TIMEOUT=0`.
 ```
 
-#### Use an existing PyTorch installation
+##### Use an existing PyTorch installation
 
 There are scenarios where the PyTorch dependency cannot be easily installed via pip, e.g.:
 
@@ -179,7 +142,7 @@ pip install -r requirements-build.txt
 pip install -e . --no-build-isolation
 ```
 
-#### Use the local cutlass for compilation
+##### Use the local cutlass for compilation
 
 Currently, before starting the build process, vLLM fetches cutlass code from GitHub. However, there may be scenarios where you want to use a local version of cutlass instead.
 To achieve this, you can set the environment variable VLLM_CUTLASS_SRC_DIR to point to your local cutlass directory.
@@ -190,7 +153,7 @@ cd vllm
 VLLM_CUTLASS_SRC_DIR=/path/to/cutlass pip install -e .
 ```
 
-#### Troubleshooting
+##### Troubleshooting
 
 To avoid your system being overloaded, you can limit the number of compilation jobs
 to be run simultaneously, via the environment variable `MAX_JOBS`. For example:
@@ -224,7 +187,7 @@ nvcc --version # verify that nvcc is in your PATH
 ${CUDA_HOME}/bin/nvcc --version # verify that nvcc is in your CUDA_HOME
 ```
 
-### Unsupported OS build
+#### Unsupported OS build
 
 vLLM can fully run only on Linux but for development purposes, you can still build it on other systems (for example, macOS), allowing for imports and a more convenient development environment. The binaries will not be compiled and won't work on non-Linux systems.
 
@@ -234,3 +197,28 @@ Simply disable the `VLLM_TARGET_DEVICE` environment variable before installing:
 export VLLM_TARGET_DEVICE=empty
 pip install -e .
 ```
+
+## Set up using Docker
+
+### Pre-built images
+
+See <project:#deployment-docker-pre-built-image> for instructions on using the official Docker image.
+
+Another way to access the latest code is to use the docker images:
+
+```console
+export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch
+docker pull public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:${VLLM_COMMIT}
+```
+
+These docker images are used for CI and testing only, and they are not intended for production use. They will be expired after several days.
+
+The latest code can contain bugs and may not be stable. Please use it with caution.
+
+### Build image from source
+
+See <project:#deployment-docker-build-image-from-source> for instructions on building the Docker image.
+
+## Supported features
+
+See <project:#feature-x-hardware> compatibility matrix for feature support information.
diff --git a/docs/source/getting_started/installation/gpu/index.md b/docs/source/getting_started/installation/gpu/index.md
new file mode 100644
index 0000000000000..6c007382b2c3d
--- /dev/null
+++ b/docs/source/getting_started/installation/gpu/index.md
@@ -0,0 +1,300 @@
+# GPU
+
+vLLM is a Python library that supports the following GPU variants. Select your GPU type to see vendor-specific instructions:
+
+::::{tab-set}
+:sync-group: device
+
+:::{tab-item} CUDA
+:sync: cuda
+
+```{include} cuda.inc.md
+:start-after: "# Installation"
+:end-before: "## Requirements"
+```
+
+:::
+
+:::{tab-item} ROCm
+:sync: rocm
+
+```{include} rocm.inc.md
+:start-after: "# Installation"
+:end-before: "## Requirements"
+```
+
+:::
+
+:::{tab-item} XPU
+:sync: xpu
+
+```{include} xpu.inc.md
+:start-after: "# Installation"
+:end-before: "## Requirements"
+```
+
+:::
+
+::::
+
+## Requirements
+
+- OS: Linux
+- Python: 3.9 -- 3.12
+
+::::{tab-set}
+:sync-group: device
+
+:::{tab-item} CUDA
+:sync: cuda
+
+```{include} cuda.inc.md
+:start-after: "## Requirements"
+:end-before: "## Set up using Python"
+```
+
+:::
+
+:::{tab-item} ROCm
+:sync: rocm
+
+```{include} rocm.inc.md
+:start-after: "## Requirements"
+:end-before: "## Set up using Python"
+```
+
+:::
+
+:::{tab-item} XPU
+:sync: xpu
+
+```{include} xpu.inc.md
+:start-after: "## Requirements"
+:end-before: "## Set up using Python"
+```
+
+:::
+
+::::
+
+## Set up using Python
+
+### Create a new Python environment
+
+```{include} ../python_env_setup.inc.md
+```
+
+::::{tab-set}
+:sync-group: device
+
+:::{tab-item} CUDA
+:sync: cuda
+
+```{include} cuda.inc.md
+:start-after: "## Create a new Python environment"
+:end-before: "### Pre-built wheels"
+```
+
+:::
+
+:::{tab-item} ROCm
+:sync: rocm
+
+There is no extra information on creating a new Python environment for this device.
+
+:::
+
+:::{tab-item} XPU
+:sync: xpu
+
+There is no extra information on creating a new Python environment for this device.
+
+:::
+
+::::
+
+### Pre-built wheels
+
+::::{tab-set}
+:sync-group: device
+
+:::{tab-item} CUDA
+:sync: cuda
+
+```{include} cuda.inc.md
+:start-after: "### Pre-built wheels"
+:end-before: "### Build wheel from source"
+```
+
+:::
+
+:::{tab-item} ROCm
+:sync: rocm
+
+```{include} rocm.inc.md
+:start-after: "### Pre-built wheels"
+:end-before: "### Build wheel from source"
+```
+
+:::
+
+:::{tab-item} XPU
+:sync: xpu
+
+```{include} xpu.inc.md
+:start-after: "### Pre-built wheels"
+:end-before: "### Build wheel from source"
+```
+
+:::
+
+::::
+
+(build-from-source)=
+
+### Build wheel from source
+
+::::{tab-set}
+:sync-group: device
+
+:::{tab-item} CUDA
+:sync: cuda
+
+```{include} cuda.inc.md
+:start-after: "### Build wheel from source"
+:end-before: "## Set up using Docker"
+```
+
+:::
+
+:::{tab-item} ROCm
+:sync: rocm
+
+```{include} rocm.inc.md
+:start-after: "### Build wheel from source"
+:end-before: "## Set up using Docker"
+```
+
+:::
+
+:::{tab-item} XPU
+:sync: xpu
+
+```{include} xpu.inc.md
+:start-after: "### Build wheel from source"
+:end-before: "## Set up using Docker"
+```
+
+:::
+
+::::
+
+## Set up using Docker
+
+### Pre-built images
+
+::::{tab-set}
+:sync-group: device
+
+:::{tab-item} CUDA
+:sync: cuda
+
+```{include} cuda.inc.md
+:start-after: "### Pre-built images"
+:end-before: "### Build image from source"
+```
+
+:::
+
+:::{tab-item} ROCm
+:sync: rocm
+
+```{include} rocm.inc.md
+:start-after: "### Pre-built images"
+:end-before: "### Build image from source"
+```
+
+:::
+
+:::{tab-item} XPU
+:sync: xpu
+
+```{include} xpu.inc.md
+:start-after: "### Pre-built images"
+:end-before: "### Build image from source"
+```
+
+:::
+
+::::
+
+### Build image from source
+
+::::{tab-set}
+:sync-group: device
+
+:::{tab-item} CUDA
+:sync: cuda
+
+```{include} cuda.inc.md
+:start-after: "### Build image from source"
+:end-before: "## Supported features"
+```
+
+:::
+
+:::{tab-item} ROCm
+:sync: rocm
+
+```{include} rocm.inc.md
+:start-after: "### Build image from source"
+:end-before: "## Supported features"
+```
+
+:::
+
+:::{tab-item} XPU
+:sync: xpu
+
+```{include} xpu.inc.md
+:start-after: "### Build image from source"
+:end-before: "## Supported features"
+```
+
+:::
+
+::::
+
+## Supported features
+
+::::{tab-set}
+:sync-group: device
+
+:::{tab-item} CUDA
+:sync: cuda
+
+```{include} cuda.inc.md
+:start-after: "## Supported features"
+```
+
+:::
+
+:::{tab-item} ROCm
+:sync: rocm
+
+```{include} rocm.inc.md
+:start-after: "## Supported features"
+```
+
+:::
+
+:::{tab-item} XPU
+:sync: xpu
+
+```{include} xpu.inc.md
+:start-after: "## Supported features"
+```
+
+:::
+
+::::
diff --git a/docs/source/getting_started/installation/gpu-rocm.md b/docs/source/getting_started/installation/gpu/rocm.inc.md
similarity index 87%
rename from docs/source/getting_started/installation/gpu-rocm.md
rename to docs/source/getting_started/installation/gpu/rocm.inc.md
index a8971bb96248c..f6f9d3c303f89 100644
--- a/docs/source/getting_started/installation/gpu-rocm.md
+++ b/docs/source/getting_started/installation/gpu/rocm.inc.md
@@ -1,82 +1,19 @@
-(installation-rocm)=
-
-# Installation for ROCm
+# Installation
 
 vLLM supports AMD GPUs with ROCm 6.2.
 
 ## Requirements
 
-- OS: Linux
-- Python: 3.9 -- 3.12
 - GPU: MI200s (gfx90a), MI300 (gfx942), Radeon RX 7900 series (gfx1100)
 - ROCm 6.2
 
-Installation options:
-
-1. [Build from source with docker](#build-from-source-docker-rocm)
-2. [Build from source](#build-from-source-rocm)
-
-(build-from-source-docker-rocm)=
-
-## Option 1: Build from source with docker (recommended)
-
-You can build and install vLLM from source.
-
-First, build a docker image from <gh-file:Dockerfile.rocm> and launch a docker container from the image.
-It is important that the user kicks off the docker build using buildkit. Either the user put DOCKER_BUILDKIT=1 as environment variable when calling docker build command, or the user needs to setup buildkit in the docker daemon configuration /etc/docker/daemon.json as follows and restart the daemon:
-
-```console
-{
-    "features": {
-        "buildkit": true
-    }
-}
-```
-
-<gh-file:Dockerfile.rocm> uses ROCm 6.2 by default, but also supports ROCm 5.7, 6.0 and 6.1 in older vLLM branches.
-It provides flexibility to customize the build of docker image using the following arguments:
-
-- `BASE_IMAGE`: specifies the base image used when running `docker build`, specifically the PyTorch on ROCm base image.
-- `BUILD_FA`: specifies whether to build CK flash-attention. The default is 1. For [Radeon RX 7900 series (gfx1100)](https://rocm.docs.amd.com/projects/radeon/en/latest/index.html), this should be set to 0 before flash-attention supports this target.
-- `FX_GFX_ARCHS`: specifies the GFX architecture that is used to build CK flash-attention, for example, `gfx90a;gfx942` for MI200 and MI300. The default is `gfx90a;gfx942`
-- `FA_BRANCH`: specifies the branch used to build the CK flash-attention in [ROCm's flash-attention repo](https://github.com/ROCmSoftwarePlatform/flash-attention). The default is `ae7928c`
-- `BUILD_TRITON`: specifies whether to build triton flash-attention. The default value is 1.
-
-Their values can be passed in when running `docker build` with `--build-arg` options.
-
-To build vllm on ROCm 6.2 for MI200 and MI300 series, you can use the default:
-
-```console
-DOCKER_BUILDKIT=1 docker build -f Dockerfile.rocm -t vllm-rocm .
-```
+## Set up using Python
 
-To build vllm on ROCm 6.2 for Radeon RX7900 series (gfx1100), you should specify `BUILD_FA` as below:
+### Pre-built wheels
 
-```console
-DOCKER_BUILDKIT=1 docker build --build-arg BUILD_FA="0" -f Dockerfile.rocm -t vllm-rocm .
-```
+Currently, there are no pre-built ROCm wheels.
 
-To run the above docker image `vllm-rocm`, use the below command:
-
-```console
-$ docker run -it \
-   --network=host \
-   --group-add=video \
-   --ipc=host \
-   --cap-add=SYS_PTRACE \
-   --security-opt seccomp=unconfined \
-   --device /dev/kfd \
-   --device /dev/dri \
-   -v <path/to/model>:/app/model \
-   vllm-rocm \
-   bash
-```
-
-Where the `<path/to/model>` is the location where the model is stored, for example, the weights for llama2 or llama3 models.
-
-(build-from-source-rocm)=
-
-## Option 2: Build from source
+### Build wheel from source
 
 0. Install prerequisites (skip if you are already in an environment/docker with the following installed):
 
@@ -157,7 +94,73 @@ Where the `<path/to/model>` is the location where the model is stored, for examp
     - The ROCm version of PyTorch, ideally, should match the ROCm driver version.
     ```
 
-    ```{tip}
-    - For MI300x (gfx942) users, to achieve optimal performance, please refer to [MI300x tuning guide](https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/index.html) for performance optimization and tuning tips on system and workflow level.
-      For vLLM, please refer to [vLLM performance optimization](https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/workload.html#vllm-performance-optimization).
-    ```
+```{tip}
+- For MI300x (gfx942) users, to achieve optimal performance, please refer to [MI300x tuning guide](https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/index.html) for performance optimization and tuning tips on system and workflow level.
+  For vLLM, please refer to [vLLM performance optimization](https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/workload.html#vllm-performance-optimization).
+```
+
+## Set up using Docker
+
+### Pre-built images
+
+Currently, there are no pre-built ROCm images.
+
+### Build image from source
+
+Building the Docker image from source is the recommended way to use vLLM with ROCm.
+
+First, build a docker image from <gh-file:Dockerfile.rocm> and launch a docker container from the image.
+It is important that the user kicks off the docker build using buildkit. Either the user puts DOCKER_BUILDKIT=1 as an environment variable when calling the docker build command, or the user needs to set up buildkit in the docker daemon configuration /etc/docker/daemon.json as follows and restart the daemon:
+
+```console
+{
+    "features": {
+        "buildkit": true
+    }
+}
+```
+
+<gh-file:Dockerfile.rocm> uses ROCm 6.2 by default, but also supports ROCm 5.7, 6.0 and 6.1 in older vLLM branches.
+It provides flexibility to customize the build of docker image using the following arguments:
+
+- `BASE_IMAGE`: specifies the base image used when running `docker build`, specifically the PyTorch on ROCm base image.
+- `BUILD_FA`: specifies whether to build CK flash-attention. The default is 1. For [Radeon RX 7900 series (gfx1100)](https://rocm.docs.amd.com/projects/radeon/en/latest/index.html), this should be set to 0 before flash-attention supports this target.
+- `FX_GFX_ARCHS`: specifies the GFX architecture that is used to build CK flash-attention, for example, `gfx90a;gfx942` for MI200 and MI300. The default is `gfx90a;gfx942`
+- `FA_BRANCH`: specifies the branch used to build the CK flash-attention in [ROCm's flash-attention repo](https://github.com/ROCmSoftwarePlatform/flash-attention). The default is `ae7928c`
+- `BUILD_TRITON`: specifies whether to build triton flash-attention. The default value is 1.
+
+Their values can be passed in when running `docker build` with `--build-arg` options.
+
+To build vllm on ROCm 6.2 for MI200 and MI300 series, you can use the default:
+
+```console
+DOCKER_BUILDKIT=1 docker build -f Dockerfile.rocm -t vllm-rocm .
+```
+
+To build vllm on ROCm 6.2 for Radeon RX7900 series (gfx1100), you should specify `BUILD_FA` as below:
+
+```console
+DOCKER_BUILDKIT=1 docker build --build-arg BUILD_FA="0" -f Dockerfile.rocm -t vllm-rocm .
+```
+
+To run the above docker image `vllm-rocm`, use the below command:
+
+```console
+docker run -it \
+   --network=host \
+   --group-add=video \
+   --ipc=host \
+   --cap-add=SYS_PTRACE \
+   --security-opt seccomp=unconfined \
+   --device /dev/kfd \
+   --device /dev/dri \
+   -v <path/to/model>:/app/model \
+   vllm-rocm \
+   bash
+```
+
+Where the `<path/to/model>` is the location where the model is stored, for example, the weights for llama2 or llama3 models.
+
+## Supported features
+
+See <project:#feature-x-hardware> compatibility matrix for feature support information.
diff --git a/docs/source/getting_started/installation/xpu.md b/docs/source/getting_started/installation/gpu/xpu.inc.md
similarity index 80%
rename from docs/source/getting_started/installation/xpu.md
rename to docs/source/getting_started/installation/gpu/xpu.inc.md
index 73758f37cf0f6..577986eba74fd 100644
--- a/docs/source/getting_started/installation/xpu.md
+++ b/docs/source/getting_started/installation/gpu/xpu.inc.md
@@ -1,40 +1,19 @@
-(installation-xpu)=
-
-# Installation for XPUs
+# Installation
 
 vLLM initially supports basic model inferencing and serving on Intel GPU platform.
 
-Table of contents:
-
-1. [Requirements](#xpu-backend-requirements)
-2. [Quick start using Dockerfile](#xpu-backend-quick-start-dockerfile)
-3. [Build from source](#build-xpu-backend-from-source)
-
-(xpu-backend-requirements)=
-
 ## Requirements
 
-- OS: Linux
 - Supported Hardware: Intel Data Center GPU, Intel ARC GPU
 - OneAPI requirements: oneAPI 2024.2
 
-(xpu-backend-quick-start-dockerfile)=
+## Set up using Python
 
-## Quick start using Dockerfile
-
-```console
-$ docker build -f Dockerfile.xpu -t vllm-xpu-env --shm-size=4g .
-$ docker run -it \
-             --rm \
-             --network=host \
-             --device /dev/dri \
-             -v /dev/dri/by-path:/dev/dri/by-path \
-             vllm-xpu-env
-```
+### Pre-built wheels
 
-(build-xpu-backend-from-source)=
+Currently, there are no pre-built XPU wheels.
 
-## Build from source
+### Build wheel from source
 
 - First, install required driver and intel OneAPI 2024.2 or later.
 - Second, install Python packages for vLLM XPU backend building:
@@ -56,7 +35,25 @@ VLLM_TARGET_DEVICE=xpu python setup.py install
   type will be supported in the future.
 ```
 
-## Distributed inference and serving
+## Set up using Docker
+
+### Pre-built images
+
+Currently, there are no pre-built XPU images.
+
+### Build image from source
+
+```console
+$ docker build -f Dockerfile.xpu -t vllm-xpu-env --shm-size=4g .
+$ docker run -it \
+             --rm \
+             --network=host \
+             --device /dev/dri \
+             -v /dev/dri/by-path:/dev/dri/by-path \
+             vllm-xpu-env
+```
+
+## Supported features
 
 XPU platform supports tensor-parallel inference/serving and also supports pipeline parallel as a beta feature for online serving. We requires Ray as the distributed runtime backend. For example, a reference execution likes following:
 
diff --git a/docs/source/getting_started/installation/index.md b/docs/source/getting_started/installation/index.md
index 0ebadca2ccec9..bc1d268bf0c7e 100644
--- a/docs/source/getting_started/installation/index.md
+++ b/docs/source/getting_started/installation/index.md
@@ -7,14 +7,7 @@ vLLM supports the following hardware platforms:
 ```{toctree}
 :maxdepth: 1
 
-gpu-cuda
-gpu-rocm
-cpu-x86
-cpu-arm
-cpu-apple
-hpu-gaudi
-tpu
-xpu
-openvino
-neuron
+gpu/index
+cpu/index
+ai_accelerator/index
 ```
diff --git a/docs/source/getting_started/installation/python_env_setup.inc.md b/docs/source/getting_started/installation/python_env_setup.inc.md
new file mode 100644
index 0000000000000..25cfac5f58aa7
--- /dev/null
+++ b/docs/source/getting_started/installation/python_env_setup.inc.md
@@ -0,0 +1,19 @@
+You can create a new Python environment using `conda`:
+
+```console
+# (Recommended) Create a new conda environment.
+conda create -n myenv python=3.12 -y
+conda activate myenv
+```
+
+```{note}
+[PyTorch has deprecated the conda release channel](https://github.com/pytorch/pytorch/issues/138506). If you use `conda`, please only use it to create the Python environment rather than to install packages.
+```
+
+Or you can create a new Python environment using [uv](https://docs.astral.sh/uv/), a very fast Python environment manager. Please follow the [documentation](https://docs.astral.sh/uv/#getting-started) to install `uv`. After installing `uv`, you can create a new Python environment using the following command:
+
+```console
+# (Recommended) Create a new uv environment. Use `--seed` to install `pip` and `setuptools` in the environment.
+uv venv myenv --python 3.12 --seed
+source myenv/bin/activate
+```

From 458e63a2c6b18e7febfa30cecb59461f96fbe324 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Mon, 13 Jan 2025 20:59:09 +0800
Subject: [PATCH 108/115] [platform] add device_control env var (#12009)

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 vllm/platforms/cuda.py      |  1 +
 vllm/platforms/hpu.py       |  1 +
 vllm/platforms/interface.py | 10 ++++++++++
 vllm/platforms/neuron.py    |  1 +
 vllm/platforms/rocm.py      |  2 ++
 vllm/platforms/tpu.py       |  1 +
 vllm/platforms/xpu.py       |  1 +
 7 files changed, 17 insertions(+)

diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py
index 3f77ec50ed31f..b6a6c461369f9 100644
--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -78,6 +78,7 @@ class CudaPlatformBase(Platform):
     device_type: str = "cuda"
     dispatch_key: str = "CUDA"
     ray_device_key: str = "GPU"
+    device_control_env_var: str = "CUDA_VISIBLE_DEVICES"
 
     @classmethod
     def get_device_capability(cls,
diff --git a/vllm/platforms/hpu.py b/vllm/platforms/hpu.py
index 0acb2804a5f66..b579ebf494bdc 100644
--- a/vllm/platforms/hpu.py
+++ b/vllm/platforms/hpu.py
@@ -20,6 +20,7 @@ class HpuPlatform(Platform):
     device_type: str = "hpu"
     dispatch_key: str = "HPU"
     ray_device_key: str = "HPU"
+    device_control_env_var: str = "HABANA_VISIBLE_MODULES"
 
     @classmethod
     def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int,
diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py
index ec917f75689dd..afa9daa9c98a7 100644
--- a/vllm/platforms/interface.py
+++ b/vllm/platforms/interface.py
@@ -78,20 +78,30 @@ class Platform:
     _enum: PlatformEnum
     device_name: str
     device_type: str
+
     # available dispatch keys:
     # check https://github.com/pytorch/pytorch/blob/313dac6c1ca0fa0cde32477509cce32089f8532a/torchgen/model.py#L134 # noqa
     # use "CPU" as a fallback for platforms not registered in PyTorch
     dispatch_key: str = "CPU"
+
     # available ray device keys:
     # https://github.com/ray-project/ray/blob/10ba5adadcc49c60af2c358a33bb943fb491a171/python/ray/_private/ray_constants.py#L438 # noqa
     # empty string means the device does not support ray
     ray_device_key: str = ""
+
+    # platform-agnostic way to specify the device control environment variable,
+    # e.g. CUDA_VISIBLE_DEVICES for CUDA.
+    # hint: search for "get_visible_accelerator_ids_env_var" in
+    # https://github.com/ray-project/ray/tree/master/python/ray/_private/accelerators # noqa
+    device_control_env_var: str = "VLLM_DEVICE_CONTROL_ENV_VAR_PLACEHOLDER"
+
     # The torch.compile backend for compiling simple and
     # standalone functions. The default value is "inductor" to keep
     # the same behavior as PyTorch.
     # NOTE: for the forward part of the model, vLLM has another separate
     # compilation strategy.
     simple_compile_backend: str = "inductor"
+
     supported_quantization: list[str] = []
 
     def is_cuda(self) -> bool:
diff --git a/vllm/platforms/neuron.py b/vllm/platforms/neuron.py
index 7f4a867b32ba1..0696f73cc17b4 100644
--- a/vllm/platforms/neuron.py
+++ b/vllm/platforms/neuron.py
@@ -18,6 +18,7 @@ class NeuronPlatform(Platform):
     device_type: str = "neuron"
     ray_device_key: str = "neuron_cores"
     supported_quantization: list[str] = ["neuron_quant"]
+    device_control_env_var: str = "NEURON_RT_VISIBLE_CORES"
 
     @classmethod
     def get_device_name(cls, device_id: int = 0) -> str:
diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py
index f12e948113723..7f1e8aef528a6 100644
--- a/vllm/platforms/rocm.py
+++ b/vllm/platforms/rocm.py
@@ -65,6 +65,8 @@ class RocmPlatform(Platform):
     device_type: str = "cuda"
     dispatch_key: str = "CUDA"
     ray_device_key: str = "GPU"
+    # rocm shares the same device control env var as CUDA
+    device_control_env_var: str = "CUDA_VISIBLE_DEVICES"
 
     supported_quantization: list[str] = [
         "awq", "gptq", "fp8", "compressed_tensors", "compressed-tensors",
diff --git a/vllm/platforms/tpu.py b/vllm/platforms/tpu.py
index 460eb170bba34..ff9487daac7a7 100644
--- a/vllm/platforms/tpu.py
+++ b/vllm/platforms/tpu.py
@@ -20,6 +20,7 @@ class TpuPlatform(Platform):
     device_type: str = "tpu"
     dispatch_key: str = "XLA"
     ray_device_key: str = "TPU"
+    device_control_env_var: str = "TPU_VISIBLE_CHIPS"
 
     supported_quantization: list[str] = [
         "tpu_int8", "compressed-tensors", "compressed_tensors"
diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py
index cb74f79b31794..f34376b44e689 100644
--- a/vllm/platforms/xpu.py
+++ b/vllm/platforms/xpu.py
@@ -22,6 +22,7 @@ class XPUPlatform(Platform):
     # Intel XPU's device key is "GPU" for Ray.
     # see https://github.com/ray-project/ray/blob/6a5eb5865eeb9ccf058a79b44f107e327e360673/python/ray/_private/accelerators/intel_gpu.py#L20 # noqa: E501
     ray_device_key: str = "GPU"
+    device_control_env_var: str = "ONEAPI_DEVICE_SELECTOR"
 
     @classmethod
     def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int,

From a7d59688fb75827db4316c24a057ac6097114bd3 Mon Sep 17 00:00:00 2001
From: Shanshan Shen <467638484@qq.com>
Date: Mon, 13 Jan 2025 21:12:10 +0800
Subject: [PATCH 109/115] [Platform] Move get_punica_wrapper() function to
 Platform (#11516)

Signed-off-by: Shanshan Shen <467638484@qq.com>
---
 vllm/lora/punica_wrapper/punica_selector.py | 26 +++++++--------------
 vllm/platforms/cpu.py                       |  4 ++++
 vllm/platforms/cuda.py                      |  4 ++++
 vllm/platforms/hpu.py                       |  4 ++++
 vllm/platforms/interface.py                 |  7 ++++++
 vllm/platforms/rocm.py                      |  4 ++++
 6 files changed, 32 insertions(+), 17 deletions(-)

diff --git a/vllm/lora/punica_wrapper/punica_selector.py b/vllm/lora/punica_wrapper/punica_selector.py
index 9f1606e672dea..a293224651992 100644
--- a/vllm/lora/punica_wrapper/punica_selector.py
+++ b/vllm/lora/punica_wrapper/punica_selector.py
@@ -1,5 +1,6 @@
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
+from vllm.utils import resolve_obj_by_qualname
 
 from .punica_base import PunicaWrapperBase
 
@@ -7,20 +8,11 @@
 
 
 def get_punica_wrapper(*args, **kwargs) -> PunicaWrapperBase:
-    if current_platform.is_cuda_alike():
-        # Lazy import to avoid ImportError
-        from vllm.lora.punica_wrapper.punica_gpu import PunicaWrapperGPU
-        logger.info_once("Using PunicaWrapperGPU.")
-        return PunicaWrapperGPU(*args, **kwargs)
-    elif current_platform.is_cpu():
-        # Lazy import to avoid ImportError
-        from vllm.lora.punica_wrapper.punica_cpu import PunicaWrapperCPU
-        logger.info_once("Using PunicaWrapperCPU.")
-        return PunicaWrapperCPU(*args, **kwargs)
-    elif current_platform.is_hpu():
-        # Lazy import to avoid ImportError
-        from vllm.lora.punica_wrapper.punica_hpu import PunicaWrapperHPU
-        logger.info_once("Using PunicaWrapperHPU.")
-        return PunicaWrapperHPU(*args, **kwargs)
-    else:
-        raise NotImplementedError
+    punica_wrapper_qualname = current_platform.get_punica_wrapper()
+    punica_wrapper_cls = resolve_obj_by_qualname(punica_wrapper_qualname)
+    punica_wrapper = punica_wrapper_cls(*args, **kwargs)
+    assert punica_wrapper is not None, \
+        "the punica_wrapper_qualname(" + punica_wrapper_qualname + ") is wrong."
+    logger.info_once("Using " + punica_wrapper_qualname.rsplit(".", 1)[1] +
+                     ".")
+    return punica_wrapper
diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py
index eb3e269cac285..4d3b84fea887f 100644
--- a/vllm/platforms/cpu.py
+++ b/vllm/platforms/cpu.py
@@ -109,3 +109,7 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
     def is_pin_memory_available(cls) -> bool:
         logger.warning("Pin memory is not supported on CPU.")
         return False
+
+    @classmethod
+    def get_punica_wrapper(cls) -> str:
+        return "vllm.lora.punica_wrapper.punica_cpu.PunicaWrapperCPU"
diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py
index b6a6c461369f9..80cefcb492531 100644
--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -218,6 +218,10 @@ def get_attn_backend_cls(cls, selected_backend, head_size, dtype,
         logger.info("Using Flash Attention backend.")
         return "vllm.attention.backends.flash_attn.FlashAttentionBackend"
 
+    @classmethod
+    def get_punica_wrapper(cls) -> str:
+        return "vllm.lora.punica_wrapper.punica_gpu.PunicaWrapperGPU"
+
 
 # NVML utils
 # Note that NVML is not affected by `CUDA_VISIBLE_DEVICES`,
diff --git a/vllm/platforms/hpu.py b/vllm/platforms/hpu.py
index b579ebf494bdc..242c2c127979a 100644
--- a/vllm/platforms/hpu.py
+++ b/vllm/platforms/hpu.py
@@ -63,3 +63,7 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
     def is_pin_memory_available(cls):
         logger.warning("Pin memory is not supported on HPU.")
         return False
+
+    @classmethod
+    def get_punica_wrapper(cls) -> str:
+        return "vllm.lora.punica_wrapper.punica_hpu.PunicaWrapperHPU"
diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py
index afa9daa9c98a7..3c2ec9636df91 100644
--- a/vllm/platforms/interface.py
+++ b/vllm/platforms/interface.py
@@ -276,6 +276,13 @@ def is_pin_memory_available(cls) -> bool:
             return False
         return True
 
+    @classmethod
+    def get_punica_wrapper(cls) -> str:
+        """
+        Return the punica wrapper for the current platform.
+        """
+        raise NotImplementedError
+
 
 class UnspecifiedPlatform(Platform):
     _enum = PlatformEnum.UNSPECIFIED
diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py
index 7f1e8aef528a6..43105d7855e79 100644
--- a/vllm/platforms/rocm.py
+++ b/vllm/platforms/rocm.py
@@ -153,3 +153,7 @@ def verify_quantization(cls, quant: str) -> None:
                 "Using AWQ quantization with ROCm, but VLLM_USE_TRITON_AWQ"
                 " is not set, enabling VLLM_USE_TRITON_AWQ.")
         envs.VLLM_USE_TRITON_AWQ = True
+
+    @classmethod
+    def get_punica_wrapper(cls) -> str:
+        return "vllm.lora.punica_wrapper.punica_gpu.PunicaWrapperGPU"

From c6db21313cef3f1f9aa5714efba0d60b5f1a8dad Mon Sep 17 00:00:00 2001
From: elijah <30852919+e1ijah1@users.noreply.github.com>
Date: Mon, 13 Jan 2025 23:22:07 +0800
Subject: [PATCH 110/115] bugfix: Fix signature mismatch in benchmark's
 `get_tokenizer` function (#11982)

Signed-off-by: elijah <f1renze.142857@gmail.com>
---
 benchmarks/backend_request_func.py | 27 ++++++++++++++++++++++++---
 1 file changed, 24 insertions(+), 3 deletions(-)

diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py
index b67849038cf0d..9d71e4ecc4a37 100644
--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@@ -417,14 +417,35 @@ def get_model(pretrained_model_name_or_path: str) -> str:
 
 
 def get_tokenizer(
-    pretrained_model_name_or_path: str, trust_remote_code: bool
+    pretrained_model_name_or_path: str,
+    tokenizer_mode: str = "auto",
+    trust_remote_code: bool = False,
+    **kwargs,
 ) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
     if pretrained_model_name_or_path is not None and not os.path.exists(
             pretrained_model_name_or_path):
         pretrained_model_name_or_path = get_model(
             pretrained_model_name_or_path)
-    return AutoTokenizer.from_pretrained(pretrained_model_name_or_path,
-                                         trust_remote_code=trust_remote_code)
+    if tokenizer_mode == "slow":
+        if kwargs.get("use_fast", False):
+            raise ValueError(
+                "Cannot use the fast tokenizer in slow tokenizer mode.")
+        kwargs["use_fast"] = False
+    if tokenizer_mode == "mistral":
+        try:
+            from vllm.transformers_utils.tokenizer import MistralTokenizer
+        except ImportError as e:
+            raise ImportError("MistralTokenizer requires vllm package.\n"
+                              "Please install it with `pip install vllm` "
+                              "to use mistral tokenizer mode.") from e
+        return MistralTokenizer.from_pretrained(
+            str(pretrained_model_name_or_path))
+    else:
+        return AutoTokenizer.from_pretrained(
+            pretrained_model_name_or_path,
+            trust_remote_code=trust_remote_code,
+            **kwargs,
+        )
 
 
 ASYNC_REQUEST_FUNCS = {

From 5a51290a5713dd66a0323386bf56c26d85d296f2 Mon Sep 17 00:00:00 2001
From: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
Date: Mon, 13 Jan 2025 20:43:25 +0000
Subject: [PATCH 111/115] Using list

---
 vllm/attention/backends/abstract.py           |  2 +-
 vllm/attention/backends/blocksparse_attn.py   |  2 +-
 vllm/attention/backends/flash_attn.py         |  2 +-
 vllm/attention/backends/flashinfer.py         |  2 +-
 vllm/attention/backends/hpu_attn.py           |  2 +-
 vllm/attention/backends/ipex_attn.py          |  2 +-
 vllm/attention/backends/pallas.py             |  2 +-
 vllm/attention/backends/rocm_flash_attn.py    |  6 +++---
 vllm/attention/backends/torch_sdpa.py         |  2 +-
 vllm/attention/backends/xformers.py           |  2 +-
 vllm/attention/layer.py                       | 12 ++++++------
 vllm/model_executor/layers/fused_moe/layer.py |  1 -
 vllm/model_executor/models/llama.py           |  4 ++--
 13 files changed, 20 insertions(+), 21 deletions(-)

diff --git a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py
index f09903a6cad49..c4430a0651f7d 100644
--- a/vllm/attention/backends/abstract.py
+++ b/vllm/attention/backends/abstract.py
@@ -252,6 +252,6 @@ def forward(
         k_scale: torch.Tensor,
         v_scale: torch.Tensor,
         output: Optional[torch.Tensor] = None,
-        fp8_comp_scales: Optional[Tuple[torch.Tensor, ...]] = None,
+        fp8_comp_scales: List[Optional[torch.Tensor]] = None,
     ) -> torch.Tensor:
         raise NotImplementedError
diff --git a/vllm/attention/backends/blocksparse_attn.py b/vllm/attention/backends/blocksparse_attn.py
index 73b04746ea113..70cfaa447c559 100644
--- a/vllm/attention/backends/blocksparse_attn.py
+++ b/vllm/attention/backends/blocksparse_attn.py
@@ -369,7 +369,7 @@ def forward(
         k_scale: torch.Tensor,
         v_scale: torch.Tensor,
         output: Optional[torch.Tensor] = None,
-        fp8_comp_scales: Optional[Tuple[torch.Tensor, ...]] = None,
+        fp8_comp_scales: List[Optional[torch.Tensor]] = None,
     ) -> torch.Tensor:
         """Forward pass with FlashAttention and PagedAttention.
 
diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py
index 02dc926c07d3c..5966cfea4c809 100644
--- a/vllm/attention/backends/flash_attn.py
+++ b/vllm/attention/backends/flash_attn.py
@@ -643,7 +643,7 @@ def forward(
         k_scale: float = 1.0,
         v_scale: float = 1.0,
         output: Optional[torch.Tensor] = None,
-        fp8_comp_scales: Optional[Tuple[torch.Tensor, ...]] = None,
+        fp8_comp_scales: List[Optional[torch.Tensor]] = None,
     ) -> torch.Tensor:
         """Forward pass with FlashAttention.
 
diff --git a/vllm/attention/backends/flashinfer.py b/vllm/attention/backends/flashinfer.py
index cf7016b9996dc..17f7e864053f5 100644
--- a/vllm/attention/backends/flashinfer.py
+++ b/vllm/attention/backends/flashinfer.py
@@ -783,7 +783,7 @@ def forward(
         k_scale: float = 1.0,
         v_scale: float = 1.0,
         output: Optional[torch.Tensor] = None,
-        fp8_comp_scales: Optional[Tuple[torch.Tensor, ...]] = None,
+        fp8_comp_scales: List[Optional[torch.Tensor]] = None,
     ) -> torch.Tensor:
 
         # TODO: directly write to output tensor
diff --git a/vllm/attention/backends/hpu_attn.py b/vllm/attention/backends/hpu_attn.py
index 959379c7a9a38..e9fafbb4d0941 100644
--- a/vllm/attention/backends/hpu_attn.py
+++ b/vllm/attention/backends/hpu_attn.py
@@ -160,7 +160,7 @@ def forward(
         k_scale: float = 1.0,
         v_scale: float = 1.0,
         output: Optional[torch.Tensor] = None,
-        fp8_comp_scales: Optional[Tuple[torch.Tensor, ...]] = None,
+        fp8_comp_scales: List[Optional[torch.Tensor]] = None,
     ) -> torch.Tensor:
         """Forward pass with xFormers and PagedAttention.
 
diff --git a/vllm/attention/backends/ipex_attn.py b/vllm/attention/backends/ipex_attn.py
index 9ef820d72b893..b1b5c9ee22a39 100644
--- a/vllm/attention/backends/ipex_attn.py
+++ b/vllm/attention/backends/ipex_attn.py
@@ -179,7 +179,7 @@ def forward(
         k_scale: float = 1.0,
         v_scale: float = 1.0,
         output: Optional[torch.Tensor] = None,
-        fp8_comp_scales: Optional[Tuple[torch.Tensor, ...]] = None,
+        fp8_comp_scales: List[Optional[torch.Tensor]] = None,
     ) -> torch.Tensor:
         """Forward pass with IPEX varlen_attention and PagedAttention.
 
diff --git a/vllm/attention/backends/pallas.py b/vllm/attention/backends/pallas.py
index ad9e3f7fcb42a..9fd4962a066f2 100644
--- a/vllm/attention/backends/pallas.py
+++ b/vllm/attention/backends/pallas.py
@@ -158,7 +158,7 @@ def forward(
         k_scale: float = 1.0,
         v_scale: float = 1.0,
         output: Optional[torch.Tensor] = None,
-        fp8_comp_scales: Optional[Tuple[torch.Tensor, ...]] = None,
+        fp8_comp_scales: List[Optional[torch.Tensor]] = None,
     ) -> torch.Tensor:
         """Forward pass with Pallas attention.
 
diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py
index cf311d705ece5..4fd49f194e949 100644
--- a/vllm/attention/backends/rocm_flash_attn.py
+++ b/vllm/attention/backends/rocm_flash_attn.py
@@ -551,7 +551,7 @@ def forward(
         k_scale: torch.Tensor,
         v_scale: torch.Tensor,
         output: Optional[torch.Tensor] = None,
-        fp8_comp_scales: Optional[Tuple[torch.Tensor, ...]] = None,
+        fp8_comp_scales: List[Optional[torch.Tensor]] = None,
     ) -> torch.Tensor:
         """Forward pass with FlashAttention and PagedAttention.
 
@@ -601,8 +601,8 @@ def forward(
         Returns:
             shape = [num_tokens, num_heads * head_size]
         """
-        q_scale, prob_scale, fp8_out_scale = fp8_comp_scales or (None, None,
-                                                                 None)
+        q_scale, prob_scale, fp8_out_scale = fp8_comp_scales or [None, None,
+                                                                 None]
 
         query = query.view(-1, self.num_heads, self.head_size)
         if key is not None:
diff --git a/vllm/attention/backends/torch_sdpa.py b/vllm/attention/backends/torch_sdpa.py
index fad90232e9f84..6dd829a30fa66 100644
--- a/vllm/attention/backends/torch_sdpa.py
+++ b/vllm/attention/backends/torch_sdpa.py
@@ -438,7 +438,7 @@ def forward(
         k_scale: float = 1.0,
         v_scale: float = 1.0,
         output: Optional[torch.Tensor] = None,
-        fp8_comp_scales: Optional[Tuple[torch.Tensor, ...]] = None,
+        fp8_comp_scales: List[Optional[torch.Tensor]] = None,
     ) -> torch.Tensor:
         """Forward pass with torch SDPA and PagedAttention.
 
diff --git a/vllm/attention/backends/xformers.py b/vllm/attention/backends/xformers.py
index 34cd949a3c38c..f240d09711ab8 100644
--- a/vllm/attention/backends/xformers.py
+++ b/vllm/attention/backends/xformers.py
@@ -422,7 +422,7 @@ def forward(
         k_scale: float = 1.0,
         v_scale: float = 1.0,
         output: Optional[torch.Tensor] = None,
-        fp8_comp_scales: Optional[Tuple[torch.Tensor, ...]] = None,
+        fp8_comp_scales: List[Optional[torch.Tensor]] = None,
     ) -> torch.Tensor:
         """Forward pass with xFormers and PagedAttention.
 
diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py
index 4061846beefd7..b0db790c36a0a 100644
--- a/vllm/attention/layer.py
+++ b/vllm/attention/layer.py
@@ -146,7 +146,7 @@ def forward(
         value: torch.Tensor,
         _kv_cache: torch.Tensor,
         _attn_metadata: AttentionMetadata,
-        fp8_comp_scales: Optional[Tuple[torch.Tensor, ...]] = None,
+        fp8_comp_scales: List[Optional[torch.Tensor]] = [],
     ) -> torch.Tensor:
         if self.calculate_kv_scales and \
             _attn_metadata.enable_kv_scales_calculation:
@@ -262,7 +262,7 @@ def unified_attention(
     key: torch.Tensor,
     value: torch.Tensor,
     layer_name: str,
-    fp8_comp_scales: Optional[Tuple[torch.Tensor, ...]],
+    fp8_comp_scales: List[Optional[torch.Tensor]],
 ) -> torch.Tensor:
     forward_context: ForwardContext = get_forward_context()
     attn_metadata = forward_context.attn_metadata
@@ -283,7 +283,7 @@ def unified_attention_fake(
     key: torch.Tensor,
     value: torch.Tensor,
     layer_name: str,
-    fp8_comp_scales: Optional[Tuple[torch.Tensor, ...]],
+    fp8_comp_scales: List[Optional[torch.Tensor]],
 ) -> torch.Tensor:
     return torch.empty_like(query).contiguous()
 
@@ -303,9 +303,9 @@ def unified_attention_with_output(
     value: torch.Tensor,
     output: torch.Tensor,
     layer_name: str,
-    fp8_comp_scales: Optional[Tuple[torch.Tensor, ...]],
+    fp8_comp_scales: List[Optional[torch.Tensor]],
 ) -> None:
-    assert fp8_comp_scales is None
+    assert not fp8_comp_scales
     forward_context: ForwardContext = get_forward_context()
     attn_metadata = forward_context.attn_metadata
     self = forward_context.attn_layers[layer_name]
@@ -326,7 +326,7 @@ def unified_attention_with_output_fake(
     value: torch.Tensor,
     output: torch.Tensor,
     layer_name: str,
-    fp8_comp_scales: Optional[Tuple[torch.Tensor, ...]],
+    fp8_comp_scales: List[Optional[torch.Tensor]],
 ) -> None:
     return
 
diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py
index 83e310066b470..811f52d72aeb5 100644
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -4,7 +4,6 @@
 
 import torch
 import torch.nn.functional as F
-from torch.nn.modules import Module
 
 import vllm.envs as envs
 from vllm.distributed import (get_tensor_model_parallel_rank,
diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py
index be221b659f316..a5b516eac4c8a 100644
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -230,10 +230,10 @@ def forward(
                                 v,
                                 kv_cache,
                                 attn_metadata,
-                                fp8_comp_scales=(self.attn._q_scale,
+                                fp8_comp_scales=[self.attn._q_scale,
                                                  self.attn._prob_scale,
                                                  self.o_proj.input_scale if
-                                                 self.attn_fp8_out else None))
+                                                 self.attn_fp8_out else None])
         output, _ = self.o_proj(attn_output)
         return output
 

From 079750e3ab84382823215ad02f8f793e1de83d80 Mon Sep 17 00:00:00 2001
From: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
Date: Mon, 13 Jan 2025 21:15:00 +0000
Subject: [PATCH 112/115] Revert "[misc] improve memory profiling (#11809)"

This reverts commit 889e662eae19fe8f30469883c6854ee4df4315a9.
---
 tests/test_utils.py                           | 19 +-----
 .../vllm_test_utils/__init__.py               |  3 +-
 .../vllm_test_utils/monitor.py                | 68 -------------------
 vllm/utils.py                                 | 12 ++--
 4 files changed, 8 insertions(+), 94 deletions(-)
 delete mode 100644 tests/vllm_test_utils/vllm_test_utils/monitor.py

diff --git a/tests/test_utils.py b/tests/test_utils.py
index 639c9a21babb7..93c9a82f8529f 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -5,7 +5,6 @@
 
 import pytest
 import torch
-from vllm_test_utils import monitor
 
 from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
 from vllm.utils import (FlexibleArgumentParser, PlaceholderModule,
@@ -309,16 +308,8 @@ def test_memory_profiling():
 
     weights_memory_in_bytes = 128 * 1024 * 1024 * 4 # 512 MiB
 
-    def measure_current_non_torch():
-        free, total = torch.cuda.mem_get_info()
-        current_used = total - free
-        current_torch = torch.cuda.memory_reserved()
-        current_non_torch = current_used - current_torch
-        return current_non_torch
-
     with memory_profiling(baseline_memory_in_bytes=baseline_memory_in_bytes,
-    weights_memory_in_bytes=weights_memory_in_bytes) as result, \
-        monitor(measure_current_non_torch) as monitored_values:
+    weights_memory_in_bytes=weights_memory_in_bytes) as result:
         # make a memory spike, 1 GiB
         spike = torch.randn(256, 1024, 1024, device='cuda', dtype=torch.float32)
         del spike
@@ -326,15 +317,7 @@ def measure_current_non_torch():
         # Add some extra non-torch memory 256 MiB (simulate NCCL)
         handle2 = lib.cudaMalloc(256 * 1024 * 1024)
 
-    # this is an analytic value, it is exact,
-    # we only have 256 MiB non-torch memory increase
-    measured_diff = monitored_values.values[-1] - monitored_values.values[0]
-    assert measured_diff == 256 * 1024 * 1024
-
     # Check that the memory usage is within 5% of the expected values
-    # 5% tolerance is caused by PyTorch caching allocator,
-    # we cannot control PyTorch's behavior of its internal buffers,
-    # which causes a small error (<10 MiB in practice)
     non_torch_ratio = result.non_torch_increase_in_bytes / (256 * 1024 * 1024) # noqa
     torch_peak_ratio = result.torch_peak_increase_in_bytes / (1024 * 1024 * 1024) # noqa
     assert abs(non_torch_ratio - 1) <= 0.05
diff --git a/tests/vllm_test_utils/vllm_test_utils/__init__.py b/tests/vllm_test_utils/vllm_test_utils/__init__.py
index 6505c81546bb0..bf0b62a5b75e3 100644
--- a/tests/vllm_test_utils/vllm_test_utils/__init__.py
+++ b/tests/vllm_test_utils/vllm_test_utils/__init__.py
@@ -4,6 +4,5 @@
 """
 
 from .blame import BlameResult, blame
-from .monitor import MonitoredValues, monitor
 
-__all__ = ["blame", "BlameResult", "monitor", "MonitoredValues"]
+__all__ = ["blame", "BlameResult"]
diff --git a/tests/vllm_test_utils/vllm_test_utils/monitor.py b/tests/vllm_test_utils/vllm_test_utils/monitor.py
deleted file mode 100644
index a237f53a75d18..0000000000000
--- a/tests/vllm_test_utils/vllm_test_utils/monitor.py
+++ /dev/null
@@ -1,68 +0,0 @@
-import contextlib
-import dataclasses
-import sys
-import traceback
-from typing import Callable, Generator, Generic, TypeVar
-
-_T = TypeVar("_T")
-
-
-@dataclasses.dataclass
-class MonitoredValues(Generic[_T]):
-    values: list[_T] = dataclasses.field(default_factory=list)
-    trace_stacks: list[str] = dataclasses.field(default_factory=list)
-
-
-@contextlib.contextmanager
-def monitor(
-    measure_func: Callable[[],
-                           _T]) -> Generator[MonitoredValues[_T], None, None]:
-    """
-    Trace the function calls to continuously monitor the change of
-    a value.
-
-    Usage:
-
-    ```python
-
-    def measure_func():
-        ... # measure the current value
-        return current_value
-
-    with monitor(measure_func) as monitored_values:
-        # do something
-    
-        monitored_values.values # all changes of the values
-        monitored_values.trace_stacks # trace stacks of every change
-    ```
-    """
-    monitored_values = MonitoredValues[_T]()
-
-    def _trace_calls(frame, event, arg=None):
-        nonlocal monitored_values
-        if event in ['line']:
-            # triggered by every line of Python code.
-            # only Python functions will trigger it,
-            # c/cpp functions will not trigger it.
-            try:
-                # Temporarily disable the trace function
-                sys.settrace(None)
-                # do a measurement
-                current_value = measure_func()
-                if len(monitored_values.values
-                       ) == 0 or current_value != monitored_values.values[-1]:
-                    monitored_values.values.append(current_value)
-                    monitored_values.trace_stacks.append("".join(
-                        traceback.format_stack()))
-                # Re-enable the trace function
-                sys.settrace(_trace_calls)
-            except NameError:
-                # modules are deleted during shutdown
-                pass
-        return _trace_calls
-
-    try:
-        sys.settrace(_trace_calls)
-        yield monitored_values
-    finally:
-        sys.settrace(None)
diff --git a/vllm/utils.py b/vllm/utils.py
index 3669c4d96224e..ad02c68d16276 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -2073,10 +2073,10 @@ class MemorySnapshot:
     timestamp: float = 0.0
 
     def measure(self):
-        self.torch_peak_in_bytes = torch.cuda.max_memory_reserved()
-        # torch.cuda.memory_reserved() is how many bytes
-        # PyTorch gets from cuda (by calling cudaMalloc, etc.)
-        self.torch_memory_in_bytes = torch.cuda.memory_reserved()
+        self.torch_peak_in_bytes = torch.cuda.memory_stats(
+        )["allocated_bytes.all.peak"]
+        self.torch_memory_in_bytes = torch.cuda.memory_stats(
+        )["allocated_bytes.all.current"]
         self.timestamp = time.time()
 
     def __sub__(self, other: "MemorySnapshot") -> "MemorySnapshot":
@@ -2153,10 +2153,10 @@ def memory_profiling(
 
     The memory used for loading weights (a.) is directly given from the argument `weights_memory_in_bytes`.
 
-    The increase of `torch.cuda.memory_stats()["allocated_bytes.all.peak"]` after profiling gives (b.).
+    The increase of ``torch.cuda.memory_stats()["allocated_bytes.all.peak"]` after profiling gives (b.).
 
     (c.) is tricky. We measure the total memory used in this GPU (`torch.cuda.mem_get_info()[1] - torch.cuda.mem_get_info()[0]`),
-    subtract the baseline memory, the memory used by the model weights, and diff of `torch.cuda.memory_reserved()`.
+    subtract the baseline memory, the memory used by the model weights, and diff of `torch.cuda.memory_stats()["allocated_bytes.all.current"]`.
     """ # noqa
     torch.cuda.reset_peak_memory_stats()
 

From 113274a0e740ac779ac713de643e12856b10ce05 Mon Sep 17 00:00:00 2001
From: Alexei-V-Ivanov-AMD
 <156011006+Alexei-V-Ivanov-AMD@users.noreply.github.com>
Date: Mon, 13 Jan 2025 16:00:11 -0600
Subject: [PATCH 113/115] Multi-lingual P3L (#356)

* Committing the *multilingual* P3L test.

* Created a *multi-lingual* P3L test.

* Making ruff happy.

* .

* Added a reference to the language-script Confluence table.

* Typo fixing.

* Harmonizing naming.

* Fixing comments in the header.

---------

Co-authored-by: Alexei V. Ivanov <alivanov@banff-cyxtera-s65-4.amd.com>
Co-authored-by: Gregory Shtrasberg <156009573+gshtras@users.noreply.github.com>
---
 benchmarks/P3L_mling.py | 250 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 250 insertions(+)
 create mode 100755 benchmarks/P3L_mling.py

diff --git a/benchmarks/P3L_mling.py b/benchmarks/P3L_mling.py
new file mode 100755
index 0000000000000..0606cc0738068
--- /dev/null
+++ b/benchmarks/P3L_mling.py
@@ -0,0 +1,250 @@
+#!/usr/bin/env python3
+"""
+*MULTILINGUAL*  Patch-Perplexity (P3L)
+
+This is a script that produces a realistic PPL measurement 
+for the quantized KV cache system by processing a sequence of 
+non-overlapping patches of the reference text. Generation of the 
+consecutive symbols in each patch is governed (forced)
+by the reference text.
+
+The initial context size for the system is set by the parameter 
+"--context-size".
+
+The number of output symbols to generate starting from a given 
+context is set by the parameter "--sample-size". This variable also 
+defines the size of the individual patch.
+
+For the N-token reference text that is split into M patches with the 
+system's context size C it takes M*preload + (N-C)*generation time.
+
+Quick correctness validation tips:
+
+Running the Llama-2-7b-chat-hf model
+(
+    ./vllm/benchmarks/P3L_mling.py
+    --model=meta-llama/Llama-2-7b-chat-hf 
+    --context-size=1024 
+    --sample-size=512
+)
+
+should result in PPL ~ 8.42927
+
+Running the Llama-2-7b-chat-hf model
+(
+    ./vllm/benchmarks/P3L_mling.py
+    --model=meta-llama/Llama-2-7b-chat-hf 
+    --context-size=1024 
+    --sample-size=512
+    --patch-size=1
+    --lang-script="cmn_Hant"
+)
+should result in PPL ~ 2.67962
+
+The multi-linguality is implemented through the additional
+key "--lang-script", which defaults to English in Latin 
+script ("eng_Latn").
+
+Please refer to 
+
+https://confluence.amd.com/display/MLSE/Multi-Lingual+P3L+Test
+
+for the complete set of possible language-script choices.
+
+
+"""
+
+import argparse
+import dataclasses
+import datetime
+import json
+import math
+import os
+
+import pandas
+from huggingface_hub import hf_hub_download
+
+from vllm import LLM, SamplingParams
+from vllm.engine.arg_utils import EngineArgs
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
+
+def get_wikitext2_text(tokenizer):
+    """Fetch wiki.test.raw from the HF hub; return (tokenizer output, text)."""
+    hf_hub_download(repo_id='alexei-v-ivanov-amd/wiki',
+                    repo_type="dataset",
+                    filename='wiki.test.raw',
+                    local_dir='./')
+    with open('./wiki.test.raw') as f:
+        test_text = "\n".join(line.strip() for line in f)
+        test_enc = tokenizer(test_text)
+    os.remove('./wiki.test.raw')
+
+    return test_enc, test_text
+
+
+def get_flores_plus_text(tokenizer, lng_scrpt):
+    """Download `<lng_scrpt>.parquet`; return (tokenizer output, joined text)."""
+    hf_hub_download(repo_id='alexei-v-ivanov-amd/flores_plus',
+                    repo_type="dataset",
+                    filename=lng_scrpt + '.parquet',
+                    local_dir='./')
+    df = pandas.read_parquet('./' + lng_scrpt + '.parquet')
+    test_text = "\n\n".join(line.strip() for line in df['text'])
+    test_enc = tokenizer(test_text)
+
+    os.remove('./' + lng_scrpt + '.parquet')
+
+    return test_enc, test_text
+
+
+def vllm_init(args):
+    """Create the LLM from parsed CLI args and the PPL sampling parameters."""
+    engine_args = EngineArgs.from_cli_args(args)
+    llm = LLM(**dataclasses.asdict(engine_args))
+    sampling_params = SamplingParams(n=1,
+                                     temperature=0.0,
+                                     top_p=1,
+                                     ignore_eos=True,
+                                     ppl_measurement=True,
+                                     future_context=[],
+                                     prompt_logprobs=1,
+                                     logprobs=1,
+                                     presence_penalty=0.0)
+
+    return llm, sampling_params
+
+
+def vllm_predict(CONT, llm, sampl_par):
+    result = llm.generate(prompt_token_ids=CONT, sampling_params=sampl_par)
+    return result
+
+
+def main(args: argparse.Namespace):
+    """Measure patch perplexity (P3L) of the model on FLORES+ text."""
+    MESSAGE = f"Initialising @ {datetime.datetime.now()}"
+    logger.info(MESSAGE)
+    print(MESSAGE)
+    my_ppl = 0.0
+
+    logger.info("Initializing the engine.")
+    my_llm, my_sampl_par = vllm_init(args)
+    my_tokenizer = my_llm.llm_engine.tokenizer.tokenizer
+    logger.info(my_sampl_par)
+    logger.info("Initialized the engine.")
+
+    my_n_samples = args.sample_size
+    my_lang_script = args.lang_script
+    # Refuse runs whose context + generated tokens exceed the model limit.
+    if (args.context_size+my_n_samples) > \
+        my_llm.llm_engine.model_config.max_model_len:
+        MESSAGE = ("" \
+            "Error! The total number of tokens:\n" \
+            f" prefix ({args.context_size}) + " \
+            f"to be generated ({my_n_samples})" \
+            f" can't be bigger than the model limit " \
+            f"({my_llm.llm_engine.model_config.max_model_len}).")
+        logger.info(MESSAGE)
+        print(MESSAGE)
+        return
+
+    my_test_enc, my_test_text = get_flores_plus_text(my_tokenizer,
+                                                     my_lang_script)
+
+    logger.info("Loaded the test data.")
+    # Number of patches to process; --patch-size overrides the computed count.
+    my_n_patches = math.ceil(
+        (len(my_test_enc['input_ids']) - args.context_size - 1) / my_n_samples)
+    if args.patch_size is not None:
+        my_n_patches = args.patch_size
+
+    num_tokens_generated = 0
+    starting_time = datetime.datetime.now()
+    MESSAGE = (f"Starting generation @ {starting_time}\n" \
+                " Have the test sample of "
+                f"{len(my_test_enc['input_ids'])} tokens" \
+                f" will try to process {my_n_patches} patche(s)," \
+                f" generating {my_n_samples} tokens in each patch" \
+                f" from the initial context of {args.context_size} tokens.")
+
+    logger.info(MESSAGE)
+    print(MESSAGE)
+    for c in range(my_n_patches):
+        CONTEXT = []
+        my_sampl_par.future_context = []
+        CONTEXT.append(
+            my_test_enc['input_ids'][c * my_n_samples:c * my_n_samples +
+                                     args.context_size])
+        upper_boundary = min((c + 1) * my_n_samples + args.context_size,
+                             len(my_test_enc['input_ids']))
+        my_sampl_par.future_context.append(
+            my_test_enc['input_ids'][c * my_n_samples +
+                                     args.context_size:upper_boundary])
+        my_sampl_par.max_tokens = len(my_sampl_par.future_context[0])
+        my_sampl_par.cntr = c
+        LOGPROBS = vllm_predict(CONTEXT, my_llm, my_sampl_par)
+        num_tokens_generated += len(LOGPROBS[0].outputs[0].token_ids)
+        if (num_tokens_generated < my_n_samples):
+            MESSAGE = (f"Warning: The number of generated tokens is " \
+                        f"less than requested ({num_tokens_generated}" \
+                        f" < {my_n_samples}).")
+            logger.info(MESSAGE)
+            print(MESSAGE)
+        my_ppl -= LOGPROBS[0].outputs[0].cumulative_logprob
+        MESSAGE = (f"Iteration {c+1} of {my_n_patches} Intermediate " \
+            "Estimates:\n" \
+            f"\tCross-entropy_intermediate={my_ppl/num_tokens_generated}\n" \
+            f"\tPerplexity_intermediate=" \
+            f"{math.exp(my_ppl/num_tokens_generated)}")
+
+        logger.info(MESSAGE)
+        print(MESSAGE)
+    ending_time = datetime.datetime.now()
+    MESSAGE = (f"Done @ {ending_time} after processing for" \
+                f" {ending_time-starting_time}" \
+                f" generated {num_tokens_generated} tokens.")
+
+    logger.info(MESSAGE)
+    print(MESSAGE)
+
+    MESSAGE = (f"\tIntegral Cross-Entropy={my_ppl}\n\tAverage Cross-Entropy=" \
+                f"{my_ppl/num_tokens_generated}" \
+                f"\n\tPPL={math.exp(my_ppl/num_tokens_generated)}")
+
+    if args.output_json:
+        results = {
+            "integral_cross_entropy": my_ppl,
+            "average_cross_entropy": my_ppl / num_tokens_generated,
+            "ppl": math.exp(my_ppl / num_tokens_generated),
+        }
+        with open(args.output_json, "w") as f:
+            json.dump(results, f, indent=4)
+
+    logger.info(MESSAGE)
+    print(MESSAGE)
+    return
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description='Measure the PPPL (P3L) score of a given model.')
+    parser.add_argument(
+        '--data',
+        type=str,
+        default='./wikitext/wikitext-2-v1/test-00000-of-00001.parquet')
+    parser.add_argument('--context-size', type=int, default=4096)
+    parser.add_argument('--sample-size', type=int, default=512)
+    parser.add_argument('--patch-size', type=int, default=None)
+    parser.add_argument('--lang-script', type=str, default="eng_Latn")
+    parser.add_argument(
+        '--output-json',
+        type=str,
+        default=None,
+        help='Path to save the perplexity results in JSON format.')
+    # Add the vLLM engine options (e.g. --model) to the script's own flags.
+    parser = EngineArgs.add_cli_args(parser)
+    args = parser.parse_args()
+
+    main(args)

From 043c93d50da96289a32fb7dbb40ec2b747e7be89 Mon Sep 17 00:00:00 2001
From: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
Date: Mon, 13 Jan 2025 23:42:47 +0000
Subject: [PATCH 114/115] Trying to make scales work with compileable attention

---
 vllm/attention/backends/abstract.py         |  4 ++-
 vllm/attention/backends/blocksparse_attn.py |  4 ++-
 vllm/attention/backends/flash_attn.py       |  4 ++-
 vllm/attention/backends/flashinfer.py       |  4 ++-
 vllm/attention/backends/hpu_attn.py         |  4 ++-
 vllm/attention/backends/ipex_attn.py        |  4 ++-
 vllm/attention/backends/pallas.py           |  4 ++-
 vllm/attention/backends/rocm_flash_attn.py  |  9 +++---
 vllm/attention/backends/torch_sdpa.py       |  4 ++-
 vllm/attention/backends/xformers.py         |  4 ++-
 vllm/attention/layer.py                     | 35 ++++++++++-----------
 vllm/model_executor/models/llama.py         | 12 ++-----
 12 files changed, 50 insertions(+), 42 deletions(-)

diff --git a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py
index c4430a0651f7d..901bdb2ff2013 100644
--- a/vllm/attention/backends/abstract.py
+++ b/vllm/attention/backends/abstract.py
@@ -251,7 +251,9 @@ def forward(
         attn_metadata: T,
         k_scale: torch.Tensor,
         v_scale: torch.Tensor,
+        q_scale: Optional[torch.Tensor] = None,
+        prob_scale: Optional[torch.Tensor] = None,
+        fp8_out_scale: Optional[torch.Tensor] = None,
         output: Optional[torch.Tensor] = None,
-        fp8_comp_scales: List[Optional[torch.Tensor]] = None,
     ) -> torch.Tensor:
         raise NotImplementedError
diff --git a/vllm/attention/backends/blocksparse_attn.py b/vllm/attention/backends/blocksparse_attn.py
index 70cfaa447c559..7840c9f076778 100644
--- a/vllm/attention/backends/blocksparse_attn.py
+++ b/vllm/attention/backends/blocksparse_attn.py
@@ -368,8 +368,10 @@ def forward(
         attn_metadata: BlocksparseFlashAttentionMetadata,
         k_scale: torch.Tensor,
         v_scale: torch.Tensor,
+        q_scale: Optional[torch.Tensor] = None,
+        prob_scale: Optional[torch.Tensor] = None,
+        fp8_out_scale: Optional[torch.Tensor] = None,
         output: Optional[torch.Tensor] = None,
-        fp8_comp_scales: List[Optional[torch.Tensor]] = None,
     ) -> torch.Tensor:
         """Forward pass with FlashAttention and PagedAttention.
 
diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py
index 5966cfea4c809..35fb78db67daf 100644
--- a/vllm/attention/backends/flash_attn.py
+++ b/vllm/attention/backends/flash_attn.py
@@ -642,8 +642,10 @@ def forward(
         attn_metadata: FlashAttentionMetadata,
         k_scale: float = 1.0,
         v_scale: float = 1.0,
+        q_scale: Optional[torch.Tensor] = None,
+        prob_scale: Optional[torch.Tensor] = None,
+        fp8_out_scale: Optional[torch.Tensor] = None,
         output: Optional[torch.Tensor] = None,
-        fp8_comp_scales: List[Optional[torch.Tensor]] = None,
     ) -> torch.Tensor:
         """Forward pass with FlashAttention.
 
diff --git a/vllm/attention/backends/flashinfer.py b/vllm/attention/backends/flashinfer.py
index 17f7e864053f5..8342e43c1f7bf 100644
--- a/vllm/attention/backends/flashinfer.py
+++ b/vllm/attention/backends/flashinfer.py
@@ -782,8 +782,10 @@ def forward(
         attn_metadata: FlashInferMetadata,
         k_scale: float = 1.0,
         v_scale: float = 1.0,
+        q_scale: Optional[torch.Tensor] = None,
+        prob_scale: Optional[torch.Tensor] = None,
+        fp8_out_scale: Optional[torch.Tensor] = None,
         output: Optional[torch.Tensor] = None,
-        fp8_comp_scales: List[Optional[torch.Tensor]] = None,
     ) -> torch.Tensor:
 
         # TODO: directly write to output tensor
diff --git a/vllm/attention/backends/hpu_attn.py b/vllm/attention/backends/hpu_attn.py
index e9fafbb4d0941..f19ee9540683f 100644
--- a/vllm/attention/backends/hpu_attn.py
+++ b/vllm/attention/backends/hpu_attn.py
@@ -159,8 +159,10 @@ def forward(
         attn_metadata: HPUAttentionMetadata,
         k_scale: float = 1.0,
         v_scale: float = 1.0,
+        q_scale: Optional[torch.Tensor] = None,
+        prob_scale: Optional[torch.Tensor] = None,
+        fp8_out_scale: Optional[torch.Tensor] = None,
         output: Optional[torch.Tensor] = None,
-        fp8_comp_scales: List[Optional[torch.Tensor]] = None,
     ) -> torch.Tensor:
         """Forward pass with xFormers and PagedAttention.
 
diff --git a/vllm/attention/backends/ipex_attn.py b/vllm/attention/backends/ipex_attn.py
index b1b5c9ee22a39..e9e8428e4d261 100644
--- a/vllm/attention/backends/ipex_attn.py
+++ b/vllm/attention/backends/ipex_attn.py
@@ -178,8 +178,10 @@ def forward(
         attn_metadata: IpexAttnMetadata,  # type: ignore
         k_scale: float = 1.0,
         v_scale: float = 1.0,
+        q_scale: Optional[torch.Tensor] = None,
+        prob_scale: Optional[torch.Tensor] = None,
+        fp8_out_scale: Optional[torch.Tensor] = None,
         output: Optional[torch.Tensor] = None,
-        fp8_comp_scales: List[Optional[torch.Tensor]] = None,
     ) -> torch.Tensor:
         """Forward pass with IPEX varlen_attention and PagedAttention.
 
diff --git a/vllm/attention/backends/pallas.py b/vllm/attention/backends/pallas.py
index 9fd4962a066f2..bac8c0b07d8fe 100644
--- a/vllm/attention/backends/pallas.py
+++ b/vllm/attention/backends/pallas.py
@@ -157,8 +157,10 @@ def forward(
         attn_metadata: PallasMetadata,
         k_scale: float = 1.0,
         v_scale: float = 1.0,
+        q_scale: Optional[torch.Tensor] = None,
+        prob_scale: Optional[torch.Tensor] = None,
+        fp8_out_scale: Optional[torch.Tensor] = None,
         output: Optional[torch.Tensor] = None,
-        fp8_comp_scales: List[Optional[torch.Tensor]] = None,
     ) -> torch.Tensor:
         """Forward pass with Pallas attention.
 
diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py
index 4fd49f194e949..174539d3146b7 100644
--- a/vllm/attention/backends/rocm_flash_attn.py
+++ b/vllm/attention/backends/rocm_flash_attn.py
@@ -550,8 +550,10 @@ def forward(
         attn_metadata: ROCmFlashAttentionMetadata,
         k_scale: torch.Tensor,
         v_scale: torch.Tensor,
+        q_scale: Optional[torch.Tensor] = None,
+        prob_scale: Optional[torch.Tensor] = None,
+        fp8_out_scale: Optional[torch.Tensor] = None,
         output: Optional[torch.Tensor] = None,
-        fp8_comp_scales: List[Optional[torch.Tensor]] = None,
     ) -> torch.Tensor:
         """Forward pass with FlashAttention and PagedAttention.
 
@@ -601,9 +603,6 @@ def forward(
         Returns:
             shape = [num_tokens, num_heads * head_size]
         """
-        q_scale, prob_scale, fp8_out_scale = fp8_comp_scales or [None, None,
-                                                                 None]
-
         query = query.view(-1, self.num_heads, self.head_size)
         if key is not None:
             assert value is not None
@@ -687,7 +686,7 @@ def forward(
                         1.0 / q_scale.item(), 1.0 / k_scale.item(),
                         1.0 / v_scale.item(), 1.0 / prob_scale.item(),
                         fp8_out_scale.item()) if (
-                            fp8_out_scale
+                            fp8_out_scale and q_scale and prob_scale
                             and envs.VLLM_USE_ROCM_FP8_FLASH_ATTN) else None
                     out, _ = self.attn_func(
                         query,
diff --git a/vllm/attention/backends/torch_sdpa.py b/vllm/attention/backends/torch_sdpa.py
index 6dd829a30fa66..1ae73d37ed891 100644
--- a/vllm/attention/backends/torch_sdpa.py
+++ b/vllm/attention/backends/torch_sdpa.py
@@ -437,8 +437,10 @@ def forward(
         attn_metadata: TorchSDPAMetadata,  # type: ignore
         k_scale: float = 1.0,
         v_scale: float = 1.0,
+        q_scale: Optional[torch.Tensor] = None,
+        prob_scale: Optional[torch.Tensor] = None,
+        fp8_out_scale: Optional[torch.Tensor] = None,
         output: Optional[torch.Tensor] = None,
-        fp8_comp_scales: List[Optional[torch.Tensor]] = None,
     ) -> torch.Tensor:
         """Forward pass with torch SDPA and PagedAttention.
 
diff --git a/vllm/attention/backends/xformers.py b/vllm/attention/backends/xformers.py
index f240d09711ab8..d28738a6d6f1a 100644
--- a/vllm/attention/backends/xformers.py
+++ b/vllm/attention/backends/xformers.py
@@ -421,8 +421,10 @@ def forward(
         attn_metadata: "XFormersMetadata",
         k_scale: float = 1.0,
         v_scale: float = 1.0,
+        q_scale: Optional[torch.Tensor] = None,
+        prob_scale: Optional[torch.Tensor] = None,
+        fp8_out_scale: Optional[torch.Tensor] = None,
         output: Optional[torch.Tensor] = None,
-        fp8_comp_scales: List[Optional[torch.Tensor]] = None,
     ) -> torch.Tensor:
         """Forward pass with xFormers and PagedAttention.
 
diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py
index b0db790c36a0a..7b612df21791c 100644
--- a/vllm/attention/layer.py
+++ b/vllm/attention/layer.py
@@ -1,5 +1,5 @@
 """Attention layer."""
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any, Dict, List, Optional
 
 import torch
 import torch.nn as nn
@@ -146,7 +146,7 @@ def forward(
         value: torch.Tensor,
         _kv_cache: torch.Tensor,
         _attn_metadata: AttentionMetadata,
-        fp8_comp_scales: List[Optional[torch.Tensor]] = [],
+        fp8_out_scale: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         if self.calculate_kv_scales and \
             _attn_metadata.enable_kv_scales_calculation:
@@ -165,18 +165,18 @@ def forward(
                 value = value.view(-1, self.num_kv_heads, self.head_size)
             if self.use_direct_call:
                 unified_attention_with_output(query, key, value, output,
-                                              self.layer_name, fp8_comp_scales)
+                                              self.layer_name, fp8_out_scale)
             else:
                 torch.ops.vllm.unified_attention_with_output(
-                    query, key, value, output, self.layer_name)
+                    query, key, value, output, self.layer_name, fp8_out_scale)
             return output.view(-1, hidden_size)
         else:
             if self.use_direct_call:
                 return unified_attention(query, key, value, self.layer_name,
-                                         fp8_comp_scales)
+                                         fp8_out_scale)
             else:
                 return torch.ops.vllm.unified_attention(
-                    query, key, value, self.layer_name, fp8_comp_scales)
+                    query, key, value, self.layer_name, fp8_out_scale)
 
     def calc_kv_scales(self, query, key, value):
         self._q_scale.copy_(torch.abs(query).max() / self.q_range)
@@ -262,20 +262,15 @@ def unified_attention(
     key: torch.Tensor,
     value: torch.Tensor,
     layer_name: str,
-    fp8_comp_scales: List[Optional[torch.Tensor]],
+    fp8_out_scale: Optional[torch.Tensor],
 ) -> torch.Tensor:
     forward_context: ForwardContext = get_forward_context()
     attn_metadata = forward_context.attn_metadata
     self = forward_context.attn_layers[layer_name]
     kv_cache = self.kv_cache[forward_context.virtual_engine]
-    return self.impl.forward(query,
-                             key,
-                             value,
-                             kv_cache,
-                             attn_metadata,
-                             self._k_scale,
-                             self._v_scale,
-                             fp8_comp_scales=fp8_comp_scales)
+    return self.impl.forward(query, key, value, kv_cache, attn_metadata,
+                             self._k_scale, self._v_scale, self._q_scale,
+                             self._prob_scale, fp8_out_scale)
 
 
 def unified_attention_fake(
@@ -283,7 +278,7 @@ def unified_attention_fake(
     key: torch.Tensor,
     value: torch.Tensor,
     layer_name: str,
-    fp8_comp_scales: List[Optional[torch.Tensor]],
+    fp8_out_scale: Optional[torch.Tensor],
 ) -> torch.Tensor:
     return torch.empty_like(query).contiguous()
 
@@ -303,9 +298,8 @@ def unified_attention_with_output(
     value: torch.Tensor,
     output: torch.Tensor,
     layer_name: str,
-    fp8_comp_scales: List[Optional[torch.Tensor]],
+    fp8_out_scale: Optional[torch.Tensor],
 ) -> None:
-    assert not fp8_comp_scales
     forward_context: ForwardContext = get_forward_context()
     attn_metadata = forward_context.attn_metadata
     self = forward_context.attn_layers[layer_name]
@@ -317,6 +311,9 @@ def unified_attention_with_output(
                       attn_metadata,
                       self._k_scale,
                       self._v_scale,
+                      self._q_scale,
+                      self._prob_scale,
+                      fp8_out_scale,
                       output=output)
 
 
@@ -326,7 +323,7 @@ def unified_attention_with_output_fake(
     value: torch.Tensor,
     output: torch.Tensor,
     layer_name: str,
-    fp8_comp_scales: List[Optional[torch.Tensor]],
+    fp8_out_scale: Optional[torch.Tensor],
 ) -> None:
     return
 
diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py
index a5b516eac4c8a..d991fabc92b08 100644
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -225,15 +225,9 @@ def forward(
         qkv, _ = self.qkv_proj(hidden_states)
         q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
         q, k = self.rotary_emb(positions, q, k)
-        attn_output = self.attn(q,
-                                k,
-                                v,
-                                kv_cache,
-                                attn_metadata,
-                                fp8_comp_scales=[self.attn._q_scale,
-                                                 self.attn._prob_scale,
-                                                 self.o_proj.input_scale if
-                                                 self.attn_fp8_out else None])
+        attn_output = self.attn(
+            q, k, v, kv_cache, attn_metadata,
+            self.o_proj.input_scale if self.attn_fp8_out else None)
         output, _ = self.o_proj(attn_output)
         return output
 

From 16f8680d720e87adda8915e584df0c51d0129453 Mon Sep 17 00:00:00 2001
From: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
Date: Tue, 14 Jan 2025 16:15:15 +0000
Subject: [PATCH 115/115] Docs lint

---
 docs/dev-docker/README.md | 167 +++++++++++++++++++-------------------
 1 file changed, 85 insertions(+), 82 deletions(-)

diff --git a/docs/dev-docker/README.md b/docs/dev-docker/README.md
index 11c0ef04fd8f7..c3496358c15d9 100644
--- a/docs/dev-docker/README.md
+++ b/docs/dev-docker/README.md
@@ -1,6 +1,6 @@
 # vllm FP8 Latency and Throughput benchmarks on AMD MI300x
 
-Documentation for vLLM Inferencing on AMD Instinct platforms. 
+Documentation for vLLM Inferencing on AMD Instinct platforms.
 
 ## Overview
 
@@ -10,11 +10,9 @@ This documentation shows some reference performance numbers and the steps to rep
 
 It includes:
 
-   -  ROCm™ 6.3
-
-   - vLLM 0.6.3
-
-   - PyTorch 2.6dev (nightly)
+- ROCm™ 6.3
+- vLLM 0.6.3
+- PyTorch 2.6dev (nightly)
 
 ## System configuration
 
@@ -39,16 +37,15 @@ The performance data below was measured on a server with MI300X accelerators wit
 | Power cap | 750 W |
 | SCLK/MCLK | 2100 Mhz / 1300 Mhz |
 
-## Pull latest 
+## Pull latest
 
 You can pull the image with `docker pull rocm/vllm-dev:main`
 
 ### What is New
 
-   - ROCm 6.3 support
-   - Potential bug with Tunable Ops not saving due to a PyTorch issue
-      
-     
+- ROCm 6.3 support
+- Potential bug with Tunable Ops not saving due to a PyTorch issue
+
 Gemms are tuned using PyTorch's Tunable Ops  feature (https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/cuda/tunable/README.md)
 The  gemms are automatically enabled in the docker image, and all stored gemm configs are kept in /app/_gemm_csv in the same image
 
@@ -56,23 +53,24 @@ The  gemms are automatically enabled in the docker image, and all stored gemm co
 
 ### Use pre-quantized models
 
-To make it easier to run fp8 Llama 3.1 models on MI300X, the quantized checkpoints are available on AMD Huggingface space as follows 
+To make it easier to run fp8 Llama 3.1 models on MI300X, the quantized checkpoints are available on AMD Huggingface space as follows
 
-- https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV 
-- https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV 
-- https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV
-- https://huggingface.co/amd/grok-1-FP8-KV
+- <https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV>
+- <https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV>
+- <https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV>
+- <https://huggingface.co/amd/grok-1-FP8-KV>
 
-Currently these models are private. Please join https://huggingface.co/amd to access. 
+Currently these models are private. Please join <https://huggingface.co/amd> to access.
 
 Download the model you want to run.  
 
-These FP8 quantized checkpoints were generated with AMD’s Quark Quantizer. For more information about Quark, please refer to https://quark.docs.amd.com/latest/quark_example_torch_llm_gen.html
+These FP8 quantized checkpoints were generated with AMD’s Quark Quantizer. For more information about Quark, please refer to <https://quark.docs.amd.com/latest/quark_example_torch_llm_gen.html>
 
 ### Quantize your own models
-This step is optional for you to use quantized models on your own. Take Llama 3.1 405B as an example. 
 
-Download the Model View the Llama-3.1-405B model at https://huggingface.co/meta-llama/Llama-3.1-405B. Ensure that you have been granted access, and apply for it if you do not have access.
+This step is optional for you to use quantized models on your own. Take Llama 3.1 405B as an example.
+
+Download the model. View the Llama-3.1-405B model at <https://huggingface.co/meta-llama/Llama-3.1-405B>. Ensure that you have been granted access, and apply for it if you do not have access.
 
 If you do not already have a HuggingFace token, open your user profile (https://huggingface.co/settings/profile), select "Access Tokens", press "+ Create New Token", and create a new Read token.
 
@@ -100,27 +98,29 @@ Similarly, you can download Llama-3.1-70B and Llama-3.1-8B.
 
 Run the quantization script in the example folder using the following command line:
 export MODEL_DIR = [local model checkpoint folder] or meta-llama/Llama-3.1-405B-Instruct
+
 #### single GPU
-        python3 quantize_quark.py \ 
-        --model_dir $MODEL_DIR \
-        --output_dir Llama-3.1-405B-Instruct-FP8-KV \                           
-        --quant_scheme w_fp8_a_fp8 \
-        --kv_cache_dtype fp8 \
-        --num_calib_data 128 \
-        --model_export quark_safetensors \
-        --no_weight_matrix_merge
-
-#### If model size is too large for single GPU, please use multi GPU instead.
-        python3 quantize_quark.py \ 
-        --model_dir $MODEL_DIR \
-        --output_dir Llama-3.1-405B-Instruct-FP8-KV \                           
-        --quant_scheme w_fp8_a_fp8 \
-        --kv_cache_dtype fp8 \
-        --num_calib_data 128 \
-        --model_export quark_safetensors \
-        --no_weight_matrix_merge \
-        --multi_gpu
 
+    python3 quantize_quark.py \
+    --model_dir $MODEL_DIR \
+    --output_dir Llama-3.1-405B-Instruct-FP8-KV \
+    --quant_scheme w_fp8_a_fp8 \
+    --kv_cache_dtype fp8 \
+    --num_calib_data 128 \
+    --model_export quark_safetensors \
+    --no_weight_matrix_merge
+
+#### If model size is too large for single GPU, please use multi GPU instead
+
+    python3 quantize_quark.py \
+    --model_dir $MODEL_DIR \
+    --output_dir Llama-3.1-405B-Instruct-FP8-KV \
+    --quant_scheme w_fp8_a_fp8 \
+    --kv_cache_dtype fp8 \
+    --num_calib_data 128 \
+    --model_export quark_safetensors \
+    --no_weight_matrix_merge \
+    --multi_gpu
 
 ### Launch AMD vLLM Docker
 
@@ -135,7 +135,7 @@ Download and launch the docker,
 
 ### Benchmark with AMD vLLM Docker
 
-There are some system settings to be configured for optimum performance on MI300X. 
+There are some system settings to be configured for optimum performance on MI300X.
 
 #### NUMA balancing setting
 
@@ -160,15 +160,16 @@ Some environment variables enhance the performance of the vLLM kernels and PyTor
     export NCCL_MIN_NCHANNELS=112
     export VLLM_FP8_PADDING=1
 
-You can set both PYTORCH_TUNABLEOP_ENABLED and PYTORCH_TUNABLEOP_TUNING to 1 to performance GEMM tuning for the 1st benchmark run. 
-It will take some time to complete the tuning during the benchmark. After tuning, it will generate several csv files as the performance lookup database. For the subsequent benchmark runs, you can keep 
+You can set both PYTORCH_TUNABLEOP_ENABLED and PYTORCH_TUNABLEOP_TUNING to 1 to perform GEMM tuning for the 1st benchmark run.
+It will take some time to complete the tuning during the benchmark. After tuning, it will generate several csv files as the performance lookup database. For the subsequent benchmark runs, you can keep
 
-PYTORCH_TUNABLEOP_ENABLED as 1 and set 
-PYTORCH_TUNABLEOP_TUNING to 0 to use the selected kernels. 
+PYTORCH_TUNABLEOP_ENABLED as 1 and set
+PYTORCH_TUNABLEOP_TUNING to 0 to use the selected kernels.
 
 ##### vLLM engine performance settings
-vLLM provides a number of engine options which can be changed to improve performance. 
-Refer https://docs.vllm.ai/en/stable/models/engine_args.html for the complete list of vLLM engine options.
+
+vLLM provides a number of engine options which can be changed to improve performance.
+Refer to <https://docs.vllm.ai/en/stable/models/engine_args.html> for the complete list of vLLM engine options.
 Below is a list of options which are useful:
 - **--max-model-len** : Maximum context length supported by the model instance. Can be set to a lower value than model configuration value to improve performance and gpu memory utilization.
 - **--max-num-batched-tokens** : The maximum prefill size, i.e., how many prompt tokens can be packed together in a single prefill. Set to a higher value to improve prefill performance at the cost of higher gpu memory utilization. 65536 works well for LLama models.
@@ -179,6 +180,7 @@ Below is a list of options which are useful:
 Note: vLLM's server creation command line (vllm serve) supports the above parameters as command line arguments.
   
 ##### Online Gemm Tuning
+
 Online Gemm tuning for small decode batch sizes can improve performance in some cases. e.g. Llama 70B upto Batch size 8
 
 If you want to do limited online tuning use --enforce-eager and tune for particular batch sizes. See example below.
@@ -239,8 +241,8 @@ If you want to run Meta-Llama-3.1-405B FP16, please run
     --input-len 128 \
     --output-len 128
 
-You can change various input-len, output-len, batch size and run the benchmark as well. When output-len is 1, it measures prefill latency (TTFT). 
-Decoding latency (TPOT) can be calculated based on the measured latency. 
+You can change various input-len, output-len, batch size and run the benchmark as well. When output-len is 1, it measures prefill latency (TTFT).
+Decoding latency (TPOT) can be calculated based on the measured latency.
 
 For more information about the parameters, please run
 
@@ -261,7 +263,7 @@ Benchmark Meta-Llama-3.1-405B FP8 with input 128 tokens, output 128 tokens and t
     --num-scheduler-steps 10 \
     --tensor-parallel-size 8 \
     --input-len 128 \
-    --output-len 128 
+    --output-len 128
 
 If you want to run Meta-Llama-3.1-405B FP16, please run
 
@@ -294,23 +296,23 @@ For more information about the parameters, please run
 
     /app/vllm/benchmarks/benchmark_throughput.py -h
 
-Tensor parallelism (TP) parameters depends on the model size. For Llama 3.1 70B and 8B model, TP 1 can be used as well for MI300X. In general, TP 8 and 1 is recommended to achieve the optimum performance. 
+Tensor parallelism (TP) parameters depend on the model size. For Llama 3.1 70B and 8B models, TP 1 can be used as well for MI300X. In general, TP 8 and 1 are recommended to achieve the optimum performance.
 
 ##### Online Server Benchmark
- 
+
 Make the following changes if required
- 
+
 /app/vllm/benchmarks/backend_request_func.py
- 
+
 line 242 + "ignore_eos": True,
- 
+
 /app/vllm/benchmarks/benchmark_serving.py
 line 245 -         interval = np.random.exponential(1.0 / request_rate)
 line 245 +         ## interval = np.random.exponential(1.0 / request_rate)
 line 246 +         interval = 1.0 / request_rate
- 
+
 Benchmark Meta-Llama-3.1-70B with input 4096 tokens, output 512 tokens and tensor parallelism 8 as an example,
- 
+
     vllm serve /data/llm/Meta-Llama-3.1-70B-Instruct-FP8-KV \
     --swap-space 16 \
     --disable-log-requests \
@@ -322,11 +324,11 @@ Benchmark Meta-Llama-3.1-70B with input 4096 tokens, output 512 tokens and tenso
     --max-num-batched-tokens 65536 \
     --gpu-memory-utilization 0.99 \
     --num_scheduler-steps 10
- 
+
 Change port (for example --port 8005) if port=8000 is currently being used by other processes.
- 
+
 run client in a separate terminal. Use port_id from previous step else port-id=8000.
- 
+
     python /app/vllm/benchmarks/benchmark_serving.py \
     --port 8000 \
     --model /data/llm/Meta-Llama-3.1-70B-Instruct-FP8-KV \
@@ -336,18 +338,18 @@ run client in a separate terminal. Use port_id from previous step else port-id=8
     --request-rate 1 \
     --num-prompts 500 \
     --percentile-metrics ttft,tpot,itl,e2el
- 
+
 Once all prompts are processed, terminate the server gracefully (ctrl+c).
- 
+
 ##### CPX mode
- 
+
 Currently only CPX-NPS1 mode is supported. So ONLY tp=1 is supported in CPX mode.
 But multiple instances can be started simultaneously (if needed) in CPX-NPS1 mode.
- 
+
 Set GPUs in CPX mode
- 
+
     rocm-smi --setcomputepartition cpx
- 
+
 Example of running Llama3.1-8B on 1 CPX-NPS1 GPU with input 4096 and output 512. As mentioned above, tp=1.
 
     HIP_VISIBLE_DEVICES=0 \
@@ -363,42 +365,43 @@ Example of running Llama3.1-8B on 1 CPX-NPS1 GPU with input 4096 and output 512.
     --output-json <path/to/output.json> \
     --quantization fp8 \
     --gpu-memory-utilization 0.99
- 
+
 Set GPU to SPX mode.
 
     rocm-smi --setcomputepartition spx
 
 ### Speculative Decoding
 
-Speculative decoding is one of the key features in vLLM. It has been supported on MI300. Here below is an example of the performance benchmark w/wo speculative decoding for Llama 3.1 405B with Llama 3.1 8B as the draft model. 
+Speculative decoding is one of the key features in vLLM. It is supported on MI300. Below is an example of the performance benchmark with and without speculative decoding for Llama 3.1 405B with Llama 3.1 8B as the draft model.
 
-Without Speculative Decoding - 
+Without Speculative Decoding -
 
      python benchmark_latency.py --model /models/models--amd--Meta-Llama-3.1-405B-Instruct-FP8-KV/ --max-model-len 26720 -tp 8 --batch-size 1 --use-v2-block-manager --input-len 1024 --output-len 128
 
-With Speculative Decoding - 
+With Speculative Decoding -
 
      python benchmark_latency.py --model /models/models--amd--Meta-Llama-3.1-405B-Instruct-FP8-KV/ --max-model-len 26720 -tp 8 --batch-size 1 --use-v2-block-manager --input-len 1024 --output-len 128 --speculative-model /models/models--amd--Meta-Llama-3.1-8B-Instruct-FP8-KV/ --num-speculative-tokens 5
 
-You should see some performance improvement about the e2e latency. 
+You should see some performance improvement in the e2e latency.
 
 ### MMLU_PRO_Biology Accuracy Eval
- 
+
 ### fp16
+
 vllm (pretrained=models--meta-llama--Meta-Llama-3.1-405B-Instruct/snapshots/069992c75aed59df00ec06c17177e76c63296a26,dtype=float16,tensor_parallel_size=8), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: 64
- 
+
 | Tasks |Version|    Filter    |n-shot|  Metric   |   |Value |   |Stderr|
 |-------|------:|--------------|-----:|-----------|---|-----:|---|-----:|
 |biology|      0|custom-extract|     5|exact_match|↑  |0.8466|±  |0.0135|
- 
+
 ### fp8
+
 vllm (pretrained=models--meta-llama--Meta-Llama-3.1-405B-Instruct/snapshots/069992c75aed59df00ec06c17177e76c63296a26,dtype=float16,quantization=fp8,quantized_weights_path=/llama.safetensors,tensor_parallel_size=8), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: 32
- 
+
 | Tasks |Version|    Filter    |n-shot|  Metric   |   |Value|   |Stderr|
 |-------|------:|--------------|-----:|-----------|---|----:|---|-----:|
 |biology|      0|custom-extract|     5|exact_match|↑  |0.848|±  |0.0134|
 
-
 ## Performance
 
 ### LLaMA2/3 *MLPerf* 70B
@@ -408,18 +411,18 @@ Please refer to the MLPerf instructions for recreating the MLPerf numbers.
 ## Version
 
 ### Release Notes
+
 20240906a: Legacy quantization formats required `--quantization fp8_rocm` as a flag instead of `--quantization fp8`
 
 Updated:
 
-vLLM: https://github.com/ROCm/vllm/commit/2c60adc83981ada77a77b2adda78ef109d2e2e2b
+vLLM: <https://github.com/ROCm/vllm/commit/2c60adc83981ada77a77b2adda78ef109d2e2e2b>
+
 ### Docker Manifest
 
 To reproduce the release docker:
 
-```
-git clone https://github.com/ROCm/vllm.git
-cd vllm
-git checkout 2c60adc83981ada77a77b2adda78ef109d2e2e2b
-docker build -f Dockerfile.rocm -t <your_tag> --build-arg BUILD_HIPBLASLT=1 --build-arg USE_CYTHON=1 .
-```
+    git clone https://github.com/ROCm/vllm.git
+    cd vllm
+    git checkout 2c60adc83981ada77a77b2adda78ef109d2e2e2b
+    docker build -f Dockerfile.rocm -t <your_tag> --build-arg BUILD_HIPBLASLT=1 --build-arg USE_CYTHON=1 .