From 7144d20b8c5548f80997d331f25a5d0b31d9b03f Mon Sep 17 00:00:00 2001
From: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com>
Date: Wed, 3 Jul 2024 16:43:35 -0400
Subject: [PATCH] Upstream sync 2024 07 01 (#350)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Upstream sync 2024 07 01 (https://github.com/neuralmagic/nm-vllm/pull/350) - not a release candidate

SUMMARY:
* Merge commits from https://github.com/vllm-project/vllm/commit/6c916ac8a80d1b2f4e0d0113a67767dc254a3598 to https://github.com/vllm-project/vllm/commit/8e0817c262da5c104f651a0ce4ac9ee0cd76f4ce
* Note that https://github.com/vllm-project/vllm/commit/6c916ac8a80d1b2f4e0d0113a67767dc254a3598 is NOT included in this merge.

COMPARE vs UPSTREAM: https://github.com/neuralmagic/nm-vllm/compare/upstream-sync-2024-07-01..8e0817c262da5c104f651a0ce4ac9ee0cd76f4ce

---------

Signed-off-by: kevin
Signed-off-by: Thomas Parnell
Signed-off-by: Stephanie Wang
Signed-off-by: Stephanie
Signed-off-by: Xiaowei Jiang
Signed-off-by: Joe Runde
Co-authored-by: Murali Andoorveedu <37849411+andoorve@users.noreply.github.com>
Co-authored-by: Isotr0py <2037008807@qq.com>
Co-authored-by: youkaichao
Co-authored-by: Michael Goin
Co-authored-by: Chang Su
Co-authored-by: Kevin H. Luu
Co-authored-by: Cyrus Leung
Co-authored-by: Woo-Yeon Lee
Co-authored-by: Jie Fu (傅杰)
Co-authored-by: Antoni Baum
Co-authored-by: Dipika Sikka
Co-authored-by: Woosuk Kwon
Co-authored-by: Matt Wong <156021403+mawong-amd@users.noreply.github.com>
Co-authored-by: Thomas Parnell
Co-authored-by: aws-patlange <90803007+aws-patlange@users.noreply.github.com>
Co-authored-by: Stephanie Wang
Co-authored-by: Stephanie
Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com>
Co-authored-by: Luka Govedič
Co-authored-by: Chih-Chieh-Yang <7364402+cyang49@users.noreply.github.com>
Co-authored-by: Lucas Wilkinson
Co-authored-by: sasha0552
Co-authored-by: Chip Kerchner <49959681+ChipKerchner@users.noreply.github.com>
Co-authored-by: Nick Hill
Co-authored-by: Abhinav Goyal
Co-authored-by: xwjiang2010 <87673679+xwjiang2010@users.noreply.github.com>
Co-authored-by: Divakar Verma <137818590+divakar-amd@users.noreply.github.com>
Co-authored-by: Roger Wang
Co-authored-by: Tyler Michael Smith
Co-authored-by: Ilya Lavrenov
Co-authored-by: Cody Yu
Co-authored-by: Robert Shaw
Co-authored-by: wangding zeng <155410488+zwd003@users.noreply.github.com>
Co-authored-by: Philipp Moritz
Co-authored-by: Lily Liu
Co-authored-by: LiuXiaoxuanPKU , bong-furiosa
Co-authored-by: mcalman <68564154+mcalman@users.noreply.github.com>
Co-authored-by: William Lin
Co-authored-by: Joe Runde
Co-authored-by: Cyrus Leung
Co-authored-by: llmpros <10524065+llmpros@users.noreply.github.com>
Co-authored-by: SangBin Cho
Co-authored-by: sang
Co-authored-by: sroy745 <142070531+sroy745@users.noreply.github.com>
Co-authored-by: zhyncs
Co-authored-by: Simon Mo
Co-authored-by: Avshalom Manevich <12231371+avshalomman@users.noreply.github.com>
Co-authored-by: derekk-nm
Co-authored-by: Domenic Barbuzzi
---
 .../configs/Meta-Llama-3-70B-Instruct.yaml | 11 +
 .../configs/Meta-Llama-3-8B-Instruct-FP8.yaml | 11 +
 .../configs/Meta-Llama-3-8B-Instruct.yaml | 11 +
 .../configs/Mixtral-8x7B-Instruct-v0.1.yaml | 11 +
 .../lm-eval-harness/configs/models-large.txt | 2 +
 .../lm-eval-harness/configs/models-small.txt | 2 +
 .../run-lm-eval-gsm-hf-baseline.sh | 46 ++
 .../run-lm-eval-gsm-vllm-baseline.sh | 51 ++
 .buildkite/lm-eval-harness/run-tests.sh | 59 ++
.../test_lm_eval_correctness.py | 54 ++ .buildkite/run-openvino-test.sh | 14 + .buildkite/test-pipeline.yaml | 52 +- .buildkite/test-template-aws.j2 | 145 ---- .github/scripts/run-tests | 3 + CMakeLists.txt | 23 +- Dockerfile.openvino | 26 + Dockerfile.ppc64le | 22 + Dockerfile.rocm | 215 +++-- benchmarks/backend_request_func.py | 12 +- benchmarks/benchmark_latency.py | 13 +- benchmarks/benchmark_throughput.py | 7 +- cmake/cpu_extension.cmake | 11 +- cmake/utils.cmake | 20 +- csrc/cpu/cpu_types.hpp | 514 +----------- csrc/cpu/cpu_types_vsx.hpp | 491 ++++++++++++ csrc/cpu/cpu_types_x86.hpp | 515 ++++++++++++ csrc/ops.h | 4 +- csrc/punica/bgmv/bgmv_config.h | 2 + .../cutlass_w8a8/scaled_mm_c2x.cu | 228 ++++-- .../cutlass_w8a8/scaled_mm_c3x.cu | 139 +++- .../cutlass_w8a8/scaled_mm_entry.cu | 32 +- csrc/torch_bindings.cpp | 2 +- docs/source/conf.py | 1 + .../input_processing_pipeline.rst | 20 + .../input_processing/model_inputs_index.rst | 39 + .../dev/multimodal/multimodal_index.rst | 8 +- .../getting_started/amd-installation.rst | 6 +- docs/source/getting_started/debugging.rst | 13 +- .../getting_started/openvino-installation.rst | 95 +++ docs/source/index.rst | 3 + docs/source/models/adding_model.rst | 4 +- docs/source/models/lora.rst | 3 + docs/source/models/supported_models.rst | 6 +- docs/source/models/vlm.rst | 15 + docs/source/serving/distributed_serving.rst | 7 +- docs/source/serving/env_vars.rst | 5 + docs/source/serving/faq.rst | 12 + examples/api_client.py | 7 +- examples/llava_next_example.py | 38 + examples/offline_inference_mlpspeculator.py | 1 - examples/openai_vision_api_client.py | 90 +++ examples/phi3v_example.py | 7 +- pyproject.toml | 2 - requirements-common.txt | 2 +- requirements-cpu.txt | 5 +- requirements-cuda.txt | 2 + requirements-openvino.txt | 9 + requirements-test.txt | 6 +- setup.py | 11 +- tests/async_engine/test_openapi_server_ray.py | 13 +- .../test_basic_correctness.py | 21 +- .../basic_correctness/test_chunked_prefill.py | 15 +- tests/basic_correctness/test_preemption.py | 16 +- tests/conftest.py | 134 ++-- .../test_basic_distributed_correctness.py | 34 +- .../test_chunked_prefill_distributed.py | 28 +- tests/distributed/test_comm_ops.py | 79 +- tests/distributed/test_custom_all_reduce.py | 5 +- .../distributed/test_multimodal_broadcast.py | 56 ++ tests/distributed/test_parallel_state.py | 62 ++ tests/distributed/test_pynccl.py | 16 +- tests/distributed/test_shm_broadcast.py | 49 +- tests/distributed/test_utils.py | 17 +- tests/entrypoints/llm/__init__.py | 0 .../test_encode.py} | 4 +- .../test_generate.py} | 4 +- .../test_generate_multiple_loras.py} | 6 +- tests/entrypoints/openai/__init__.py | 0 .../test_chat.py} | 579 +------------- tests/entrypoints/openai/test_completion.py | 653 ++++++++++++++++ .../test_embedding.py} | 39 +- .../{ => openai}/test_guided_processors.py | 2 - tests/entrypoints/openai/test_models.py | 75 ++ .../test_oot_registration.py} | 2 - .../test_run_batch.py} | 0 tests/entrypoints/openai/test_serving_chat.py | 2 - .../test_vision.py} | 19 +- tests/kernels/test_attention_selector.py | 9 +- tests/kernels/test_cutlass.py | 136 +++- tests/kernels/test_moe.py | 3 +- tests/lora/conftest.py | 4 +- tests/lora/test_mixtral.py | 4 +- tests/lora/test_punica.py | 1 + tests/metrics/test_metrics.py | 12 +- tests/models/test_big_models.py | 15 +- tests/models/test_compressed_tensors.py | 55 ++ tests/models/test_llava.py | 105 ++- tests/models/test_llava_next.py | 48 +- tests/models/test_models.py | 16 +- tests/models/test_phi3v.py | 123 ++- 
tests/models/utils.py | 40 +- .../{test_processor.py => test_mapper.py} | 95 ++- tests/quantization/test_compressed_tensors.py | 51 +- tests/quantization/test_fp8.py | 17 + tests/quantization/utils.py | 3 +- tests/samplers/test_sampler.py | 2 +- .../test_typical_acceptance_sampler.py | 96 ++- .../e2e/test_integration_dist_tp2.py | 116 +++ ...n_dist.py => test_integration_dist_tp4.py} | 40 +- tests/spec_decode/e2e/test_mlp_correctness.py | 222 ++++++ .../e2e/test_multistep_correctness.py | 54 +- tests/spec_decode/test_dynamic_spec_decode.py | 15 +- tests/spec_decode/test_metrics.py | 94 +-- tests/spec_decode/test_multi_step_worker.py | 3 + tests/spec_decode/test_spec_decode_worker.py | 154 ++-- tests/spec_decode/test_utils.py | 22 + tests/spec_decode/utils.py | 5 +- tests/test_utils.py | 61 +- tests/tokenization/test_get_eos.py | 38 + tests/tokenization/test_tokenizer_group.py | 99 +++ tests/utils.py | 51 +- tests/worker/test_model_input.py | 158 ++++ tests/worker/test_model_runner.py | 57 +- vllm/_custom_ops.py | 10 +- vllm/attention/backends/abstract.py | 6 +- vllm/attention/backends/blocksparse_attn.py | 4 +- vllm/attention/backends/flash_attn.py | 4 +- vllm/attention/backends/flashinfer.py | 87 +- vllm/attention/backends/ipex_attn.py | 4 +- vllm/attention/backends/openvino.py | 101 +++ vllm/attention/backends/pallas.py | 21 +- vllm/attention/backends/rocm_flash_attn.py | 4 +- vllm/attention/backends/torch_sdpa.py | 4 +- vllm/attention/backends/xformers.py | 4 +- vllm/attention/layer.py | 23 +- .../ops/blocksparse_attention/interface.py | 6 +- vllm/attention/ops/prefix_prefill.py | 4 +- vllm/attention/selector.py | 17 +- vllm/block.py | 82 +- vllm/config.py | 166 +++- vllm/core/block_manager_v1.py | 19 +- .../device_communicators/custom_all_reduce.py | 58 +- .../custom_all_reduce_utils.py | 11 +- .../device_communicators/pynccl.py | 14 +- .../device_communicators/shm_broadcast.py | 74 +- vllm/distributed/parallel_state.py | 311 +++++++- vllm/engine/arg_utils.py | 130 ++- vllm/engine/async_llm_engine.py | 16 +- vllm/engine/llm_engine.py | 86 +- vllm/engine/metrics.py | 293 ++++--- vllm/entrypoints/openai/api_server.py | 31 +- vllm/entrypoints/openai/protocol.py | 57 +- vllm/entrypoints/openai/serving_completion.py | 32 +- vllm/entrypoints/openai/serving_embedding.py | 26 +- vllm/entrypoints/openai/serving_engine.py | 16 +- vllm/envs.py | 25 +- vllm/executor/distributed_gpu_executor.py | 16 +- vllm/executor/executor_base.py | 4 +- vllm/executor/gpu_executor.py | 2 +- vllm/executor/multiproc_gpu_executor.py | 16 +- vllm/executor/neuron_executor.py | 9 +- vllm/executor/openvino_executor.py | 163 ++++ vllm/executor/ray_gpu_executor.py | 5 +- vllm/executor/tpu_executor.py | 58 +- vllm/inputs/__init__.py | 19 + vllm/{inputs.py => inputs/data.py} | 18 +- vllm/inputs/registry.py | 207 +++++ vllm/lora/layers.py | 4 + vllm/lora/lora.py | 3 +- vllm/lora/models.py | 69 +- vllm/lora/punica.py | 3 +- .../layers/fused_moe/__init__.py | 3 +- ...14336,device_name=AMD_Instinct_MI300X.json | 164 ++-- ...=1792,device_name=AMD_Instinct_MI300X.json | 182 +++-- ...=3584,device_name=AMD_Instinct_MI300X.json | 172 ++-- ...=7168,device_name=AMD_Instinct_MI300X.json | 176 +++-- .../layers/fused_moe/fused_moe.py | 143 +++- vllm/model_executor/layers/layernorm.py | 46 ++ vllm/model_executor/layers/linear.py | 136 ++-- .../model_executor/layers/logits_processor.py | 10 +- .../model_executor/layers/quantization/awq.py | 3 +- .../layers/quantization/base_config.py | 3 +- .../layers/quantization/bitsandbytes.py | 2 +- 
.../compressed_tensors/compressed_tensors.py | 47 +- .../compressed_tensors/schemes/__init__.py | 5 +- .../schemes/compressed_tensors_scheme.py | 8 + .../schemes/compressed_tensors_unquantized.py | 3 + .../schemes/compressed_tensors_w4a16_24.py | 4 + .../schemes/compressed_tensors_w8a8.py | 91 +-- ...s_w4a16.py => compressed_tensors_wNa16.py} | 8 +- .../model_executor/layers/quantization/fp8.py | 72 +- .../layers/quantization/gptq_marlin.py | 3 +- .../layers/quantization/squeezellm.py | 3 +- .../layers/quantization/utils/marlin_utils.py | 3 +- .../layers/rejection_sampler.py | 18 +- .../model_executor/layers/rotary_embedding.py | 136 ++++ vllm/model_executor/layers/sampler.py | 4 +- .../layers/spec_decode_base_sampler.py | 15 +- .../layers/typical_acceptance_sampler.py | 22 +- .../layers/vocab_parallel_embedding.py | 4 +- vllm/model_executor/model_loader/loader.py | 24 +- vllm/model_executor/model_loader/openvino.py | 210 +++++ vllm/model_executor/models/__init__.py | 2 + vllm/model_executor/models/baichuan.py | 9 +- vllm/model_executor/models/chatglm.py | 9 +- vllm/model_executor/models/clip.py | 77 +- vllm/model_executor/models/decilm.py | 4 +- vllm/model_executor/models/deepseek_v2.py | 534 +++++++++++++ vllm/model_executor/models/gemma.py | 33 +- vllm/model_executor/models/gemma2.py | 401 ++++++++++ vllm/model_executor/models/gpt_bigcode.py | 7 +- vllm/model_executor/models/interfaces.py | 144 ++++ vllm/model_executor/models/llama.py | 7 +- vllm/model_executor/models/llava.py | 62 +- vllm/model_executor/models/llava_next.py | 158 +++- vllm/model_executor/models/minicpm.py | 10 +- vllm/model_executor/models/mixtral.py | 7 +- vllm/model_executor/models/mlp_speculator.py | 15 +- vllm/model_executor/models/phi.py | 20 +- vllm/model_executor/models/phi3v.py | 154 +++- vllm/model_executor/models/qwen2.py | 8 +- vllm/model_executor/models/vlm_base.py | 12 - vllm/model_executor/models/xverse.py | 9 +- vllm/multimodal/__init__.py | 11 +- vllm/multimodal/base.py | 84 +- vllm/multimodal/image.py | 78 +- vllm/multimodal/registry.py | 133 +--- vllm/multimodal/utils.py | 12 +- vllm/sampling_params.py | 29 +- vllm/sequence.py | 70 +- vllm/spec_decode/draft_model_runner.py | 170 ++++ vllm/spec_decode/metrics.py | 24 +- vllm/spec_decode/mlp_speculator_worker.py | 3 +- vllm/spec_decode/multi_step_worker.py | 38 +- vllm/spec_decode/proposer_worker_base.py | 4 +- .../spec_decode/smaller_tp_proposer_worker.py | 149 ++++ vllm/spec_decode/spec_decode_worker.py | 77 +- vllm/transformers_utils/config.py | 24 +- .../configs/mlp_speculator.py | 3 + vllm/transformers_utils/image_processor.py | 4 - .../tokenizer_group/base_tokenizer_group.py | 4 + .../tokenizer_group/ray_tokenizer_group.py | 116 ++- vllm/utils.py | 92 ++- vllm/worker/cpu_model_runner.py | 191 +++-- vllm/worker/cpu_worker.py | 86 +- vllm/worker/embedding_model_runner.py | 142 ++-- vllm/worker/model_runner.py | 740 +++++++++++------- vllm/worker/model_runner_base.py | 158 ++++ vllm/worker/neuron_model_runner.py | 71 +- vllm/worker/neuron_worker.py | 39 +- vllm/worker/openvino_model_runner.py | 330 ++++++++ vllm/worker/openvino_worker.py | 353 +++++++++ vllm/worker/tpu_model_runner.py | 140 +++- vllm/worker/tpu_worker.py | 167 +++- vllm/worker/worker.py | 135 +--- vllm/worker/worker_base.py | 181 ++++- vllm/worker/xpu_model_runner.py | 100 ++- 257 files changed, 12471 insertions(+), 4350 deletions(-) create mode 100644 .buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct.yaml create mode 100644 
.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8.yaml create mode 100644 .buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml create mode 100644 .buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1.yaml create mode 100644 .buildkite/lm-eval-harness/configs/models-large.txt create mode 100644 .buildkite/lm-eval-harness/configs/models-small.txt create mode 100644 .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh create mode 100644 .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh create mode 100644 .buildkite/lm-eval-harness/run-tests.sh create mode 100644 .buildkite/lm-eval-harness/test_lm_eval_correctness.py create mode 100755 .buildkite/run-openvino-test.sh delete mode 100644 .buildkite/test-template-aws.j2 create mode 100644 Dockerfile.openvino create mode 100644 Dockerfile.ppc64le create mode 100644 csrc/cpu/cpu_types_vsx.hpp create mode 100644 csrc/cpu/cpu_types_x86.hpp mode change 100755 => 100644 csrc/punica/bgmv/bgmv_config.h create mode 100644 docs/source/dev/input_processing/input_processing_pipeline.rst create mode 100644 docs/source/dev/input_processing/model_inputs_index.rst create mode 100644 docs/source/getting_started/openvino-installation.rst create mode 100644 docs/source/serving/faq.rst create mode 100644 examples/llava_next_example.py create mode 100644 examples/openai_vision_api_client.py create mode 100644 requirements-openvino.txt create mode 100644 tests/distributed/test_multimodal_broadcast.py create mode 100644 tests/distributed/test_parallel_state.py create mode 100644 tests/entrypoints/llm/__init__.py rename tests/entrypoints/{test_llm_encode.py => llm/test_encode.py} (98%) rename tests/entrypoints/{test_llm_generate.py => llm/test_generate.py} (98%) rename tests/entrypoints/{test_llm_generate_multiple_loras.py => llm/test_generate_multiple_loras.py} (95%) create mode 100644 tests/entrypoints/openai/__init__.py rename tests/entrypoints/{test_openai_server.py => openai/test_chat.py} (61%) create mode 100644 tests/entrypoints/openai/test_completion.py rename tests/entrypoints/{test_openai_embedding.py => openai/test_embedding.py} (74%) rename tests/entrypoints/{ => openai}/test_guided_processors.py (99%) create mode 100644 tests/entrypoints/openai/test_models.py rename tests/entrypoints/{test_server_oot_registration.py => openai/test_oot_registration.py} (98%) rename tests/entrypoints/{test_openai_run_batch.py => openai/test_run_batch.py} (100%) rename tests/entrypoints/{test_openai_vision.py => openai/test_vision.py} (95%) create mode 100644 tests/models/test_compressed_tensors.py rename tests/multimodal/{test_processor.py => test_mapper.py} (62%) create mode 100644 tests/spec_decode/e2e/test_integration_dist_tp2.py rename tests/spec_decode/e2e/{test_integration_dist.py => test_integration_dist_tp4.py} (66%) create mode 100644 tests/spec_decode/e2e/test_mlp_correctness.py create mode 100644 tests/tokenization/test_get_eos.py create mode 100644 tests/worker/test_model_input.py create mode 100644 vllm/attention/backends/openvino.py create mode 100644 vllm/executor/openvino_executor.py create mode 100644 vllm/inputs/__init__.py rename vllm/{inputs.py => inputs/data.py} (90%) create mode 100644 vllm/inputs/registry.py rename vllm/model_executor/layers/quantization/compressed_tensors/schemes/{compressed_tensors_w4a16.py => compressed_tensors_wNa16.py} (96%) create mode 100644 vllm/model_executor/model_loader/openvino.py mode change 100755 => 100644 vllm/model_executor/models/__init__.py create mode 100644 
vllm/model_executor/models/deepseek_v2.py create mode 100644 vllm/model_executor/models/gemma2.py create mode 100644 vllm/model_executor/models/interfaces.py delete mode 100644 vllm/model_executor/models/vlm_base.py create mode 100644 vllm/spec_decode/draft_model_runner.py create mode 100644 vllm/spec_decode/smaller_tp_proposer_worker.py create mode 100644 vllm/worker/model_runner_base.py create mode 100644 vllm/worker/openvino_model_runner.py create mode 100644 vllm/worker/openvino_worker.py diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct.yaml new file mode 100644 index 0000000000000..fa6ea236ef04f --- /dev/null +++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct.yaml @@ -0,0 +1,11 @@ +# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-70B-Instruct -b 32 -l 250 -f 5 +model_name: "meta-llama/Meta-Llama-3-70B-Instruct" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.892 + - name: "exact_match,flexible-extract" + value: 0.892 +limit: 250 +num_fewshot: 5 diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8.yaml new file mode 100644 index 0000000000000..02668702b83af --- /dev/null +++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8.yaml @@ -0,0 +1,11 @@ +# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m neuralmagic/Meta-Llama-3-8B-Instruct-FP8 -b 32 -l 250 -f 5 -t 1 +model_name: "neuralmagic/Meta-Llama-3-8B-Instruct-FP8" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.756 + - name: "exact_match,flexible-extract" + value: 0.752 +limit: 250 +num_fewshot: 5 diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml new file mode 100644 index 0000000000000..fb4b4915ab955 --- /dev/null +++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml @@ -0,0 +1,11 @@ +# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5 -t 1 +model_name: "meta-llama/Meta-Llama-3-8B-Instruct" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.756 + - name: "exact_match,flexible-extract" + value: 0.752 +limit: 250 +num_fewshot: 5 diff --git a/.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1.yaml b/.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1.yaml new file mode 100644 index 0000000000000..dec9164d1b84e --- /dev/null +++ b/.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1.yaml @@ -0,0 +1,11 @@ +# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1 -b 32 -l 250 -f 5 -t 4 +model_name: "mistralai/Mixtral-8x7B-Instruct-v0.1" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.616 + - name: "exact_match,flexible-extract" + value: 0.632 +limit: 250 +num_fewshot: 5 diff --git a/.buildkite/lm-eval-harness/configs/models-large.txt b/.buildkite/lm-eval-harness/configs/models-large.txt new file mode 100644 index 0000000000000..127ec5d97bcff --- /dev/null +++ b/.buildkite/lm-eval-harness/configs/models-large.txt @@ -0,0 +1,2 @@ +Meta-Llama-3-70B-Instruct.yaml +Mixtral-8x7B-Instruct-v0.1.yaml diff --git a/.buildkite/lm-eval-harness/configs/models-small.txt 
b/.buildkite/lm-eval-harness/configs/models-small.txt new file mode 100644 index 0000000000000..273c5482db264 --- /dev/null +++ b/.buildkite/lm-eval-harness/configs/models-small.txt @@ -0,0 +1,2 @@ +Meta-Llama-3-8B-Instruct.yaml +Meta-Llama-3-8B-Instruct-FP8.yaml diff --git a/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh new file mode 100644 index 0000000000000..fdb8ec5393b36 --- /dev/null +++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh @@ -0,0 +1,46 @@ +#!/bin/bash +# We can use this script to compute baseline accuracy on GSM for transformers. +# +# Make sure you have lm-eval-harness installed: +# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@9516087b81a61d0e220b22cc1b75be76de23bc10 + +usage() { + echo`` + echo "Runs lm eval harness on GSM8k using huggingface transformers." + echo "This pathway is intended to be used to create baselines for " + echo "our automated nm-test-accuracy workflow" + echo + echo "usage: ${0} " + echo + echo " -m - huggingface stub or local directory of the model" + echo " -b - batch size to run the evaluation at" + echo " -l - limit number of samples to run" + echo " -f - number of fewshot samples to use" + echo +} + +while getopts "m:b:l:f:" OPT; do + case ${OPT} in + m ) + MODEL="$OPTARG" + ;; + b ) + BATCH_SIZE="$OPTARG" + ;; + l ) + LIMIT="$OPTARG" + ;; + f ) + FEWSHOT="$OPTARG" + ;; + \? ) + usage + exit 1 + ;; + esac +done + +lm_eval --model hf \ + --model_args pretrained=$MODEL,parallelize=True \ + --tasks gsm8k --num_fewshot $FEWSHOT --limit $LIMIT \ + --batch_size $BATCH_SIZE diff --git a/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh new file mode 100644 index 0000000000000..a2876bade8893 --- /dev/null +++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh @@ -0,0 +1,51 @@ +#!/bin/bash +# We can use this script to compute baseline accuracy on GSM for vllm. +# We use this for fp8, which HF does not support. +# +# Make sure you have lm-eval-harness installed: +# pip install lm-eval==0.4.2 + +usage() { + echo`` + echo "Runs lm eval harness on GSM8k using huggingface transformers." + echo "This pathway is intended to be used to create baselines for " + echo "our automated nm-test-accuracy workflow" + echo + echo "usage: ${0} " + echo + echo " -m - huggingface stub or local directory of the model" + echo " -b - batch size to run the evaluation at" + echo " -l - limit number of samples to run" + echo " -f - number of fewshot samples to use" + echo " -t - tensor parallel size to run at" + echo +} + +while getopts "m:b:l:f:t:" OPT; do + case ${OPT} in + m ) + MODEL="$OPTARG" + ;; + b ) + BATCH_SIZE="$OPTARG" + ;; + l ) + LIMIT="$OPTARG" + ;; + f ) + FEWSHOT="$OPTARG" + ;; + t ) + TP_SIZE="$OPTARG" + ;; + \? 
) + usage + exit 1 + ;; + esac +done + +lm_eval --model vllm \ + --model_args pretrained=$MODEL,tensor_parallel_size=$TP_SIZE \ + --tasks gsm8k --num_fewshot $FEWSHOT --limit $LIMIT \ + --batch_size $BATCH_SIZE diff --git a/.buildkite/lm-eval-harness/run-tests.sh b/.buildkite/lm-eval-harness/run-tests.sh new file mode 100644 index 0000000000000..b4fdde6dab425 --- /dev/null +++ b/.buildkite/lm-eval-harness/run-tests.sh @@ -0,0 +1,59 @@ +#!/bin/bash + +usage() { + echo`` + echo "Runs lm eval harness on GSM8k using vllm and compares to " + echo "precomputed baseline (measured by HF transformers.)" + echo + echo "usage: ${0} " + echo + echo " -c - path to the test data config (e.g. configs/small-models.txt)" + echo " -t - tensor parallel size" + echo +} + +SUCCESS=0 + +while getopts "c:t:" OPT; do + case ${OPT} in + c ) + CONFIG="$OPTARG" + ;; + t ) + TP_SIZE="$OPTARG" + ;; + \? ) + usage + exit 1 + ;; + esac +done + +# Parse list of configs. +IFS=$'\n' read -d '' -r -a MODEL_CONFIGS < $CONFIG + +for MODEL_CONFIG in "${MODEL_CONFIGS[@]}" +do + LOCAL_SUCCESS=0 + + echo "=== RUNNING MODEL: $MODEL_CONFIG WITH TP SIZE: $TP_SIZE===" + + export LM_EVAL_TEST_DATA_FILE=$PWD/configs/${MODEL_CONFIG} + export LM_EVAL_TP_SIZE=$TP_SIZE + pytest -s test_lm_eval_correctness.py || LOCAL_SUCCESS=$? + + if [[ $LOCAL_SUCCESS == 0 ]]; then + echo "=== PASSED MODEL: ${MODEL_CONFIG} ===" + else + echo "=== FAILED MODEL: ${MODEL_CONFIG} ===" + fi + + SUCCESS=$((SUCCESS + LOCAL_SUCCESS)) + +done + +if [ "${SUCCESS}" -eq "0" ]; then + exit 0 +else + exit 1 +fi diff --git a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py new file mode 100644 index 0000000000000..975841dad1c29 --- /dev/null +++ b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py @@ -0,0 +1,54 @@ +""" +LM eval harness on model to compare vs HF baseline computed offline. +Configs are found in configs/$MODEL.yaml + +* export LM_EVAL_TEST_DATA_FILE=configs/Meta-Llama-3-70B-Instruct.yaml +* export LM_EVAL_TP_SIZE=4 +* pytest -s test_lm_eval_correctness.py +""" + +import os +from pathlib import Path + +import lm_eval +import numpy +import yaml + +RTOL = 0.02 +TEST_DATA_FILE = os.environ.get( + "LM_EVAL_TEST_DATA_FILE", + ".buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml") + +TP_SIZE = os.environ.get("LM_EVAL_TP_SIZE", 1) + + +def launch_lm_eval(eval_config): + model_args = f"pretrained={eval_config['model_name']}," \ + f"tensor_parallel_size={TP_SIZE}" + + results = lm_eval.simple_evaluate( + model="vllm", + model_args=model_args, + tasks=[task["name"] for task in eval_config["tasks"]], + num_fewshot=eval_config["num_fewshot"], + limit=eval_config["limit"], + batch_size="auto") + + return results + + +def test_lm_eval_correctness(): + eval_config = yaml.safe_load( + Path(TEST_DATA_FILE).read_text(encoding="utf-8")) + + # Launch eval requests. + results = launch_lm_eval(eval_config) + + # Confirm scores match ground truth. 
+ for task in eval_config["tasks"]: + for metric in task["metrics"]: + ground_truth = metric["value"] + measured_value = results["results"][task["name"]][metric["name"]] + print(f'{task["name"]} | {metric["name"]}: ' + f'ground_truth={ground_truth} | measured={measured_value}') + assert numpy.isclose(ground_truth, measured_value, rtol=RTOL) diff --git a/.buildkite/run-openvino-test.sh b/.buildkite/run-openvino-test.sh new file mode 100755 index 0000000000000..70e56596c4a86 --- /dev/null +++ b/.buildkite/run-openvino-test.sh @@ -0,0 +1,14 @@ +# This script build the OpenVINO docker image and run the offline inference inside the container. +# It serves a sanity check for compilation and basic model usage. +set -ex + +# Try building the docker image +docker build -t openvino-test -f Dockerfile.openvino . + +# Setup cleanup +remove_docker_container() { docker rm -f openvino-test || true; } +trap remove_docker_container EXIT +remove_docker_container + +# Run the image and launch offline inference +docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/vllm/examples/offline_inference.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 0b87e6280f0bb..d96e3c6d192e2 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -1,7 +1,10 @@ # In this file, you can add more tests to run either by adding a new step or # adding a new command to an existing step. See different options here for examples. -# This script will be feed into Jinja template in `test-template-aws.j2` to generate -# the final pipeline yaml file. + +# This script will be feed into Jinja template in `test-template-aws.j2` at +# https://github.com/vllm-project/buildkite-ci/blob/main/scripts/test-template-aws.j2 +# to generate the final pipeline yaml file. 
+ steps: - label: Regression Test @@ -24,7 +27,9 @@ steps: - label: Core Test mirror_hardwares: [amd] - command: pytest -v -s core + commands: + - pytest -v -s core + - pytest -v -s distributed/test_parallel_state.py - label: Distributed Comm Ops Test #mirror_hardwares: [amd] @@ -39,19 +44,21 @@ steps: working_dir: "/vllm-workspace/tests" num_gpus: 2 commands: - # FIXIT: find out which code initialize cuda before running the test - # before the fix, we need to use spawn to test it - - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - bash ../.buildkite/download-images.sh - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py + - TEST_DIST_MODEL=llava-hf/llava-1.5-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_multimodal_broadcast.py + - TEST_DIST_MODEL=microsoft/Phi-3-vision-128k-instruct DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_multimodal_broadcast.py - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py - - pytest -v -s spec_decode/e2e/test_integration_dist.py + - TEST_DIST_MODEL=llava-hf/llava-1.5-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_multimodal_broadcast.py + - TEST_DIST_MODEL=microsoft/Phi-3-vision-128k-instruct DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_multimodal_broadcast.py + - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py @@ -60,14 +67,12 @@ steps: working_dir: "/vllm-workspace/tests" num_gpus: 4 commands: - # FIXIT: find out which code initialize cuda before running the test - # before the fix, we need to use spawn to test it - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -v -s distributed/test_pynccl.py # We want to test that models which use 2 GPUs work with 4 GPUs, which is why we duplicate them here. # See https://github.com/vllm-project/vllm/pull/5473#issuecomment-2166601837 for context. 
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py + - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py - label: Engine Test mirror_hardwares: [amd] @@ -77,8 +82,8 @@ steps: mirror_hardwares: [amd] commands: - - pytest -v -s entrypoints -m llm - - pytest -v -s entrypoints -m openai + - pytest -v -s entrypoints/llm + - pytest -v -s entrypoints/openai - label: Examples Test working_dir: "/vllm-workspace/examples" @@ -186,6 +191,22 @@ steps: - pip install aiohttp - bash run-benchmarks.sh +- label: LM Eval Small Models + working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" + commands: + - pip install lm-eval + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - bash ./run-tests.sh -c configs/models-small.txt -t 1 + +- label: LM Eval Large Models + gpu: a100 + num_gpus: 4 + working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" + commands: + - pip install lm-eval + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - bash ./run-tests.sh -c configs/models-large.txt -t 4 + - label: Documentation Build working_dir: "/vllm-workspace/test_docs/docs" no_gpu: True @@ -197,11 +218,12 @@ steps: gpu: a100 num_gpus: 4 commands: - # FIXIT: find out which code initialize cuda before running the test - # before the fix, we need to use spawn to test it - - export VLLM_WORKER_MULTIPROC_METHOD=spawn # NOTE: don't test llama model here, it seems hf implementation is buggy # see https://github.com/vllm-project/vllm/pull/5689 for details - pytest -v -s distributed/test_custom_all_reduce.py - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py + - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.5/flashinfer-0.0.5+cu121torch2.3-cp310-cp310-linux_x86_64.whl + - VLLM_ATTENTION_BACKEND=FLASHINFER TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py + - VLLM_ATTENTION_BACKEND=FLASHINFER TEST_DIST_MODEL=meta-llama/Meta-Llama-3-8B DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py + - pytest -v -s -x lora/test_mixtral.py diff --git a/.buildkite/test-template-aws.j2 b/.buildkite/test-template-aws.j2 deleted file mode 100644 index 1a7fb44c2ecc5..0000000000000 --- a/.buildkite/test-template-aws.j2 +++ /dev/null @@ -1,145 +0,0 @@ -{% set docker_image = "public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT" %} -{% set default_working_dir = "/vllm-workspace/tests" %} - -steps: - - label: ":docker: build image" - agents: - queue: cpu_queue - commands: - - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - - "docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --tag {{ docker_image }} --target test --progress plain ." 
- - "docker push {{ docker_image }}" - env: - DOCKER_BUILDKIT: "1" - retry: - automatic: - - exit_status: -1 # Agent was lost - limit: 5 - - exit_status: -10 # Agent was lost - limit: 5 - - wait - - - group: "AMD Tests" - depends_on: ~ - steps: - {% for step in steps %} - {% if step.mirror_hardwares and "amd" in step.mirror_hardwares %} - - label: "AMD: {{ step.label }}" - agents: - queue: amd - command: bash .buildkite/run-amd-test.sh "cd {{ (step.working_dir or default_working_dir) | safe }} ; {{ step.command or (step.commands | join(" ; ")) | safe }}" - env: - DOCKER_BUILDKIT: "1" - priority: 100 - soft_fail: true - {% endif %} - {% endfor %} - - - label: "Neuron Test" - depends_on: ~ - agents: - queue: neuron - command: bash .buildkite/run-neuron-test.sh - soft_fail: false - - - label: "Intel CPU Test" - depends_on: ~ - agents: - queue: intel-cpu - command: bash .buildkite/run-cpu-test.sh - - - label: "Intel GPU Test" - depends_on: ~ - agents: - queue: intel-gpu - command: bash .buildkite/run-xpu-test.sh - - {% for step in steps %} - {% if step.gpu == "a100" %} - - label: "{{ step.label }}" - agents: - queue: a100-queue - soft_fail: {{ step.soft_fail or false }} - {% if step.parallelism %} - parallelism: {{ step.parallelism }} - {% endif %} - retry: - automatic: - - exit_status: -1 # Agent was lost - limit: 5 - - exit_status: -10 # Agent was lost - limit: 5 - plugins: - - kubernetes: - podSpec: - priorityClassName: ci - containers: - - image: {{ docker_image }} - command: ["bash"] - args: - - '-c' - - "'cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(' && ')) | safe }}'" - resources: - limits: - nvidia.com/gpu: {{ step.num_gpus or 1 }} - volumeMounts: - - name: devshm - mountPath: /dev/shm - env: - - name: VLLM_USAGE_SOURCE - value: ci-test - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB - volumes: - - name: devshm - emptyDir: - medium: Memory - {% else %} - - label: "{{ step.label }}" - agents: - {% if step.label == "Documentation Build" %} - queue: small_cpu_queue - {% elif step.no_gpu %} - queue: cpu_queue - {% elif step.num_gpus == 2 or step.num_gpus == 4 %} - queue: gpu_4_queue - {% else %} - queue: gpu_1_queue - {% endif %} - soft_fail: {{ step.soft_fail or false }} - {% if step.parallelism %} - parallelism: {{ step.parallelism }} - {% endif %} - retry: - automatic: - - exit_status: -1 # Agent was lost - limit: 5 - - exit_status: -10 # Agent was lost - limit: 5 - plugins: - - docker#v5.2.0: - image: {{ docker_image }} - always-pull: true - propagate-environment: true - {% if not step.no_gpu %} - gpus: all - {% endif %} - {% if step.label == "Benchmarks" %} - mount-buildkite-agent: true - {% endif %} - command: ["bash", "-c", "cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(' && ')) | safe }}"] - environment: - - VLLM_USAGE_SOURCE=ci-test - - HF_TOKEN - {% if step.label == "Speculative decoding tests" %} - - VLLM_ATTENTION_BACKEND=XFORMERS - {% endif %} - volumes: - - /dev/shm:/dev/shm - {% endif %} - {% endfor %} diff --git a/.github/scripts/run-tests b/.github/scripts/run-tests index e2e12772d0816..bed217a40fe25 100755 --- a/.github/scripts/run-tests +++ b/.github/scripts/run-tests @@ -85,6 +85,9 @@ do CUDA_VISIBLE_DEVICES=0,1 pytest ${CC_PYTEST_FLAGS} --junitxml=${RESULT_XML} ${TEST} || LOCAL_SUCCESS=$? 
elif [[ "${TEST}" == *"distributed/test_same_node"* ]]; then VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 ${TEST} || LOCAL_SUCCESS=$? + elif [[ "${TEST}" == *"distributed/test_multimodal_broadcast.py"* ]]; then + TEST_DIST_MODEL=llava-hf/llava-1.5-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest ${CC_PYTEST_FLAGS} --junitxml=${RESULT_XML} ${TEST} || LOCAL_SUCCESS=$? + TEST_DIST_MODEL=microsoft/Phi-3-vision-128k-instruct DISTRIBUTED_EXECUTOR_BACKEND=mp pytest ${CC_PYTEST_FLAGS} --junitxml=${RESULT_XML} ${TEST} || LOCAL_SUCCESS=$? elif [[ "${TEST}" == *"distributed"* ]]; then CUDA_VISIBLE_DEVICES=0,1 pytest ${CC_PYTEST_FLAGS} --junitxml=${RESULT_XML} ${TEST} || LOCAL_SUCCESS=$? elif [[ "${TEST}" == *"test_models_logprobs"* ]]; then diff --git a/CMakeLists.txt b/CMakeLists.txt index aa15b632cdd3b..ede9192cd1dbb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,7 +2,8 @@ cmake_minimum_required(VERSION 3.21) project(vllm_extensions LANGUAGES CXX) -option(VLLM_TARGET_DEVICE "Target device backend for vLLM" "cuda") +# CUDA by default, can be overridden by using -DVLLM_TARGET_DEVICE=... (used by setup.py) +set(VLLM_TARGET_DEVICE "cuda" CACHE STRING "Target device backend for vLLM") message(STATUS "Build type: ${CMAKE_BUILD_TYPE}") message(STATUS "Target device: ${VLLM_TARGET_DEVICE}") @@ -32,8 +33,7 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx11 # versions are derived from Dockerfile.rocm # set(TORCH_SUPPORTED_VERSION_CUDA "2.3.0") -set(TORCH_SUPPORTED_VERSION_ROCM_5X "2.0.1") -set(TORCH_SUPPORTED_VERSION_ROCM_6X "2.1.1") +set(TORCH_SUPPORTED_VERSION_ROCM "2.4.0") # # Try to find python package with an executable that exactly matches @@ -98,18 +98,11 @@ elseif(HIP_FOUND) # .hip extension automatically, HIP must be enabled explicitly. enable_language(HIP) - # ROCm 5.x - if (ROCM_VERSION_DEV_MAJOR EQUAL 5 AND - NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM_5X}) - message(WARNING "Pytorch version ${TORCH_SUPPORTED_VERSION_ROCM_5X} " - "expected for ROCMm 5.x build, saw ${Torch_VERSION} instead.") - endif() - - # ROCm 6.x - if (ROCM_VERSION_DEV_MAJOR EQUAL 6 AND - NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM_6X}) - message(WARNING "Pytorch version ${TORCH_SUPPORTED_VERSION_ROCM_6X} " - "expected for ROCMm 6.x build, saw ${Torch_VERSION} instead.") + # ROCm 5.X and 6.X + if (ROCM_VERSION_DEV_MAJOR GREATER_EQUAL 5 AND + NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM}) + message(WARNING "Pytorch version ${TORCH_SUPPORTED_VERSION_ROCM} " + "expected for ROCm build, saw ${Torch_VERSION} instead.") endif() else() message(FATAL_ERROR "Can't find CUDA or HIP installation.") diff --git a/Dockerfile.openvino b/Dockerfile.openvino new file mode 100644 index 0000000000000..9861997b451a9 --- /dev/null +++ b/Dockerfile.openvino @@ -0,0 +1,26 @@ +# The vLLM Dockerfile is used to construct vLLM image that can be directly used +# to run the OpenAI compatible server. 
+ +FROM ubuntu:22.04 AS dev + +RUN apt-get update -y && \ + apt-get install -y python3-pip git +WORKDIR /workspace + +# copy requirements +COPY requirements-build.txt /workspace/vllm/ +COPY requirements-common.txt /workspace/vllm/ +COPY requirements-openvino.txt /workspace/vllm/ + +COPY vllm/ /workspace/vllm/vllm +COPY setup.py /workspace/vllm/ + +# install build requirements +RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/vllm/requirements-build.txt +# build vLLM with OpenVINO backend +RUN PIP_PRE=1 PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu https://storage.openvinotoolkit.org/simple/wheels/nightly/" VLLM_TARGET_DEVICE="openvino" python3 -m pip install /workspace/vllm/ + +COPY examples/ /workspace/vllm/examples +COPY benchmarks/ /workspace/vllm/benchmarks + +CMD ["/bin/bash"] diff --git a/Dockerfile.ppc64le b/Dockerfile.ppc64le new file mode 100644 index 0000000000000..d4e4c483cada8 --- /dev/null +++ b/Dockerfile.ppc64le @@ -0,0 +1,22 @@ +FROM mambaorg/micromamba +ARG MAMBA_DOCKERFILE_ACTIVATE=1 +USER root + +RUN apt-get update -y && apt-get install -y git wget vim numactl gcc-12 g++-12 protobuf-compiler libprotobuf-dev && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 + +# Some packages in requirements-cpu are installed here +# IBM provides optimized packages for ppc64le processors in the open-ce project for mamba +# Currently these may not be available for venv or pip directly +RUN micromamba install -y -n base -c https://ftp.osuosl.org/pub/open-ce/1.11.0-p10/ -c defaults python=3.10 pytorch-cpu=2.1.2 torchvision-cpu=0.16.2 && micromamba clean --all --yes + +COPY ./ /workspace/vllm + +WORKDIR /workspace/vllm + +# These packages will be in rocketce eventually +RUN pip install -v -r requirements-cpu.txt --prefer-binary --extra-index-url https://repo.fury.io/mgiessing + +RUN VLLM_TARGET_DEVICE=cpu python3 setup.py install + +WORKDIR /vllm-workspace +ENTRYPOINT ["/opt/conda/bin/python3", "-m", "vllm.entrypoints.openai.api_server"] diff --git a/Dockerfile.rocm b/Dockerfile.rocm index 6bda696859c8b..1b89b892bbf1c 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -1,34 +1,35 @@ -# default base image -ARG BASE_IMAGE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" - -FROM $BASE_IMAGE - -ARG BASE_IMAGE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" - -RUN echo "Base image is $BASE_IMAGE" - -ARG ROCm_5_7_BASE="rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1" \ - ROCm_6_0_BASE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" - +# Default ROCm 6.1 base image +ARG BASE_IMAGE="rocm/pytorch:rocm6.1.2_ubuntu20.04_py3.9_pytorch_staging" + +# Tested and supported base rocm/pytorch images +ARG ROCm_5_7_BASE="rocm/pytorch:rocm5.7_ubuntu20.04_py3.9_pytorch_2.0.1" \ + ROCm_6_0_BASE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" \ + ROCM_6_1_BASE="rocm/pytorch:rocm6.1.2_ubuntu20.04_py3.9_pytorch_staging" + +# Default ROCm ARCHes to build vLLM for. +ARG PYTORCH_ROCM_ARCH="gfx908;gfx90a;gfx942;gfx1100" + +# Whether to build CK-based flash-attention +# If 0, will not build flash attention +# This is useful for gfx target where flash-attention is not supported +# (i.e. those that do not appear in `FA_GFX_ARCHS`) +# Triton FA is used by default on ROCm now so this is unnecessary. 
+ARG BUILD_FA="1" ARG FA_GFX_ARCHS="gfx90a;gfx942" -RUN echo "FA_GFX_ARCHS is $FA_GFX_ARCHS" - ARG FA_BRANCH="ae7928c" -RUN echo "FA_BRANCH is $FA_BRANCH" -# whether to build flash-attention -# if 0, will not build flash attention -# this is useful for gfx target where flash-attention is not supported -# In that case, we need to use the python reference attention implementation in vllm -ARG BUILD_FA="1" - -# whether to build triton on rocm +# Whether to build triton on rocm ARG BUILD_TRITON="1" +ARG TRITON_BRANCH="0ef1848" -# Install some basic utilities -RUN apt-get update && apt-get install python3 python3-pip -y +### Base image build stage +FROM $BASE_IMAGE AS base + +# Import arg(s) defined before this build stage +ARG PYTORCH_ROCM_ARCH # Install some basic utilities +RUN apt-get update && apt-get install python3 python3-pip -y RUN apt-get update && apt-get install -y \ curl \ ca-certificates \ @@ -39,79 +40,165 @@ RUN apt-get update && apt-get install -y \ build-essential \ wget \ unzip \ - nvidia-cuda-toolkit \ tmux \ ccache \ && rm -rf /var/lib/apt/lists/* -### Mount Point ### -# When launching the container, mount the code directory to /app +# When launching the container, mount the code directory to /vllm-workspace ARG APP_MOUNT=/vllm-workspace -VOLUME [ ${APP_MOUNT} ] WORKDIR ${APP_MOUNT} -RUN python3 -m pip install --upgrade pip -RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas +RUN pip install --upgrade pip +# Remove sccache so it doesn't interfere with ccache +# TODO: implement sccache support across components +RUN apt-get purge -y sccache; pip uninstall -y sccache; rm -f "$(which sccache)" +# Install torch == 2.4.0 on ROCm +RUN case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \ + *"rocm-5.7"*) \ + pip uninstall -y torch torchaudio torchvision \ + && pip install --no-cache-dir --pre \ + torch==2.4.0.dev20240612 torchaudio==2.4.0.dev20240612 \ + torchvision==0.19.0.dev20240612 \ + --index-url https://download.pytorch.org/whl/nightly/rocm5.7;; \ + *"rocm-6.0"*) \ + pip uninstall -y torch torchaudio torchvision \ + && pip install --no-cache-dir --pre \ + torch==2.4.0.dev20240612 torchaudio==2.4.0.dev20240612 \ + torchvision==0.19.0.dev20240612 \ + --index-url https://download.pytorch.org/whl/nightly/rocm6.0;; \ + *"rocm-6.1"*) \ + pip uninstall -y torch torchaudio torchvision \ + && pip install --no-cache-dir --pre \ + torch==2.4.0.dev20240612 torchaudio==2.4.0.dev20240612 \ + torchvision==0.19.0.dev20240612 \ + --index-url https://download.pytorch.org/whl/nightly/rocm6.1;; \ + *) ;; esac ENV LLVM_SYMBOLIZER_PATH=/opt/rocm/llvm/bin/llvm-symbolizer ENV PATH=$PATH:/opt/rocm/bin:/libtorch/bin: ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib/:/libtorch/lib: ENV CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/libtorch/include:/libtorch/include/torch/csrc/api/include/:/opt/rocm/include/: -# Install ROCm flash-attention -RUN if [ "$BUILD_FA" = "1" ]; then \ - mkdir libs \ +ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} +ENV CCACHE_DIR=/root/.cache/ccache + + +### AMD-SMI build stage +FROM base AS build_amdsmi +# Build amdsmi wheel always +RUN cd /opt/rocm/share/amd_smi \ + && pip wheel . 
--wheel-dir=/install + + +### Flash-Attention wheel build stage +FROM base AS build_fa +ARG BUILD_FA +ARG FA_GFX_ARCHS +ARG FA_BRANCH +# Build ROCm flash-attention wheel if `BUILD_FA = 1` +RUN --mount=type=cache,target=${CCACHE_DIR} \ + if [ "$BUILD_FA" = "1" ]; then \ + mkdir -p libs \ && cd libs \ && git clone https://github.com/ROCm/flash-attention.git \ && cd flash-attention \ - && git checkout ${FA_BRANCH} \ + && git checkout "${FA_BRANCH}" \ && git submodule update --init \ - && export GPU_ARCHS=${FA_GFX_ARCHS} \ - && if [ "$BASE_IMAGE" = "$ROCm_5_7_BASE" ]; then \ - patch /opt/conda/envs/py_3.10/lib/python3.10/site-packages/torch/utils/hipify/hipify_python.py hipify_patch.patch; fi \ - && python3 setup.py install \ - && cd ..; \ + && case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \ + *"rocm-5.7"*) \ + export VLLM_TORCH_PATH="$(python3 -c 'import torch; print(torch.__path__[0])')" \ + && patch "${VLLM_TORCH_PATH}"/utils/hipify/hipify_python.py hipify_patch.patch;; \ + *) ;; esac \ + && GPU_ARCHS="${FA_GFX_ARCHS}" python3 setup.py bdist_wheel --dist-dir=/install; \ + # Create an empty directory otherwise as later build stages expect one + else mkdir -p /install; \ fi -# Error related to odd state for numpy 1.20.3 where there is no METADATA etc, but an extra LICENSES_bundled.txt. -# Manually removed it so that later steps of numpy upgrade can continue -RUN if [ "$BASE_IMAGE" = "$ROCm_6_0_BASE" ]; then \ - rm -rf /opt/conda/envs/py_3.9/lib/python3.9/site-packages/numpy-1.20.3.dist-info/; fi -# build triton -RUN if [ "$BUILD_TRITON" = "1" ]; then \ +### Triton wheel build stage +FROM base AS build_triton +ARG BUILD_TRITON +ARG TRITON_BRANCH +# Build triton wheel if `BUILD_TRITON = 1` +RUN --mount=type=cache,target=${CCACHE_DIR} \ + if [ "$BUILD_TRITON" = "1" ]; then \ mkdir -p libs \ && cd libs \ - && pip uninstall -y triton \ - && git clone https://github.com/ROCm/triton.git \ - && cd triton/python \ - && pip3 install . \ - && cd ../..; \ + && git clone https://github.com/OpenAI/triton.git \ + && cd triton \ + && git checkout "${TRITON_BRANCH}" \ + && cd python \ + && python3 setup.py bdist_wheel --dist-dir=/install; \ + # Create an empty directory otherwise as later build stages expect one + else mkdir -p /install; \ fi -WORKDIR /vllm-workspace + +### Final vLLM build stage +FROM base AS final +# Import the vLLM development directory from the build context COPY . . -#RUN python3 -m pip install pynvml # to be removed eventually -RUN python3 -m pip install --upgrade pip numba +# Error related to odd state for numpy 1.20.3 where there is no METADATA etc, but an extra LICENSES_bundled.txt. 
+# Manually remove it so that later steps of numpy upgrade can continue +RUN case "$(which python3)" in \ + *"/opt/conda/envs/py_3.9"*) \ + rm -rf /opt/conda/envs/py_3.9/lib/python3.9/site-packages/numpy-1.20.3.dist-info/;; \ + *) ;; esac + +# Package upgrades for useful functionality or to avoid dependency issues +RUN --mount=type=cache,target=/root/.cache/pip \ + pip install --upgrade numba scipy huggingface-hub[cli] -# make sure punica kernels are built (for LoRA) +# Make sure punica kernels are built (for LoRA) ENV VLLM_INSTALL_PUNICA_KERNELS=1 # Workaround for ray >= 2.10.0 ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1 +# Silences the HF Tokenizers warning +ENV TOKENIZERS_PARALLELISM=false -ENV VLLM_NCCL_SO_PATH=/opt/rocm/lib/librccl.so - -ENV CCACHE_DIR=/root/.cache/ccache -RUN --mount=type=cache,target=/root/.cache/ccache \ +RUN --mount=type=cache,target=${CCACHE_DIR} \ --mount=type=cache,target=/root/.cache/pip \ pip install -U -r requirements-rocm.txt \ - && if [ "$BASE_IMAGE" = "$ROCm_6_0_BASE" ]; then \ - patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h ./rocm_patch/rocm_bf16.patch; fi \ - && python3 setup.py install \ - && export VLLM_PYTHON_VERSION=$(python -c "import sys; print(str(sys.version_info.major) + str(sys.version_info.minor))") \ - && cp build/lib.linux-x86_64-cpython-${VLLM_PYTHON_VERSION}/vllm/*.so vllm/ \ - && cd .. + && case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \ + *"rocm-6.0"*) \ + patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h rocm_patch/rocm_bf16.patch;; \ + *"rocm-6.1"*) \ + # Bring in upgrades to HIP graph earlier than ROCm 6.2 for vLLM + wget -N https://github.com/ROCm/vllm/raw/fa78403/rocm_patch/libamdhip64.so.6 -P rocm_patch \ + && cp rocm_patch/libamdhip64.so.6 /opt/rocm/lib/libamdhip64.so.6 \ + # Prevent interference if torch bundles its own HIP runtime + && rm -f "$(python3 -c 'import torch; print(torch.__path__[0])')"/lib/libamdhip64.so* || true;; \ + *) ;; esac \ + && python3 setup.py clean --all \ + && python3 setup.py develop + +# Copy amdsmi wheel into final image +RUN --mount=type=bind,from=build_amdsmi,src=/install,target=/install \ + mkdir -p libs \ + && cp /install/*.whl libs \ + # Preemptively uninstall to avoid same-version no-installs + && pip uninstall -y amdsmi; +# Copy triton wheel(s) into final image if they were built +RUN --mount=type=bind,from=build_triton,src=/install,target=/install \ + mkdir -p libs \ + && if ls /install/*.whl; then \ + cp /install/*.whl libs \ + # Preemptively uninstall to avoid same-version no-installs + && pip uninstall -y triton; fi + +# Copy flash-attn wheel(s) into final image if they were built +RUN --mount=type=bind,from=build_fa,src=/install,target=/install \ + mkdir -p libs \ + && if ls /install/*.whl; then \ + cp /install/*.whl libs \ + # Preemptively uninstall to avoid same-version no-installs + && pip uninstall -y flash-attn; fi + +# Install wheels that were built to the final image +RUN --mount=type=cache,target=/root/.cache/pip \ + if ls libs/*.whl; then \ + pip install libs/*.whl; fi CMD ["/bin/bash"] diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py index 4350b96b04a6a..fd2461ce58b93 100644 --- a/benchmarks/backend_request_func.py +++ b/benchmarks/backend_request_func.py @@ -265,6 +265,9 @@ async def async_request_openai_completions( else: data = json.loads(chunk) + # NOTE: Some completion API might have a last + # usage summary response without a token so we + # want to check a token was generated if data["choices"][0]["text"]: 
timestamp = time.perf_counter() # First token @@ -273,12 +276,8 @@ async def async_request_openai_completions( output.ttft = ttft # Decoding phase - # NOTE: Some completion API might have a last - # usage summary response without a token so we - # do not want to include as inter-token-latency - elif data.get("usage", None) is None: - output.itl.append(timestamp - - most_recent_timestamp) + output.itl.append(timestamp - + most_recent_timestamp) most_recent_timestamp = timestamp generated_text += data["choices"][0]["text"] @@ -423,4 +422,5 @@ def get_tokenizer( "openai": async_request_openai_completions, "openai-chat": async_request_openai_chat_completions, "tensorrt-llm": async_request_trt_llm, + "scalellm": async_request_openai_completions, } diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index a4cf0632b7790..a46ee15817f4c 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -25,6 +25,8 @@ def main(args: argparse.Namespace): model=args.model, speculative_model=args.speculative_model, num_speculative_tokens=args.num_speculative_tokens, + speculative_draft_tensor_parallel_size=\ + args.speculative_draft_tensor_parallel_size, tokenizer=args.tokenizer, quantization=args.quantization, tensor_parallel_size=args.tensor_parallel_size, @@ -127,6 +129,10 @@ def run_to_completion(profile_dir: Optional[str] = None): parser.add_argument('--model', type=str, default='facebook/opt-125m') parser.add_argument('--speculative-model', type=str, default=None) parser.add_argument('--num-speculative-tokens', type=int, default=None) + parser.add_argument('--speculative-draft-tensor-parallel-size', + '-spec-draft-tp', + type=int, + default=None) parser.add_argument('--tokenizer', type=str, default=None) parser.add_argument('--quantization', '-q', @@ -201,9 +207,10 @@ def run_to_completion(profile_dir: Optional[str] = None): parser.add_argument( "--device", type=str, - default="cuda", - choices=["cuda", "cpu", "tpu", "xpu"], - help='device type for vLLM execution, supporting CUDA and CPU.') + default="auto", + choices=["auto", "cuda", "cpu", "openvino", "tpu", "xpu"], + help='device type for vLLM execution, supporting CUDA, OpenVINO and ' + 'CPU.') parser.add_argument('--block-size', type=int, default=16, diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 2c6beb4e89672..a52e67bbbe7e3 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -349,9 +349,10 @@ def main(args: argparse.Namespace): parser.add_argument( "--device", type=str, - default="cuda", - choices=["cuda", "cpu", "tpu", "xpu"], - help='device type for vLLM execution, supporting CUDA and CPU.') + default="auto", + choices=["auto", "cuda", "cpu", "openvino", "tpu", "xpu"], + help='device type for vLLM execution, supporting CUDA, OpenVINO and ' + 'CPU.') parser.add_argument( "--enable-prefix-caching", action='store_true', diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake index 511e443f78403..690559ee265e9 100644 --- a/cmake/cpu_extension.cmake +++ b/cmake/cpu_extension.cmake @@ -46,6 +46,8 @@ is_avx512_disabled(AVX512_DISABLED) find_isa(${CPUINFO} "avx2" AVX2_FOUND) find_isa(${CPUINFO} "avx512f" AVX512_FOUND) +find_isa(${CPUINFO} "POWER10" POWER10_FOUND) +find_isa(${CPUINFO} "POWER9" POWER9_FOUND) if (AVX512_FOUND AND NOT AVX512_DISABLED) list(APPEND CXX_COMPILE_FLAGS @@ -68,8 +70,15 @@ if (AVX512_FOUND AND NOT AVX512_DISABLED) elseif (AVX2_FOUND) list(APPEND CXX_COMPILE_FLAGS "-mavx2") 
 message(WARNING "vLLM CPU backend using AVX2 ISA")
+elseif (POWER9_FOUND OR POWER10_FOUND)
+ message(STATUS "PowerPC detected")
+ # Check for PowerPC VSX support
+ list(APPEND CXX_COMPILE_FLAGS
+ "-mvsx"
+ "-mcpu=native"
+ "-mtune=native")
 else()
- message(FATAL_ERROR "vLLM CPU backend requires AVX512 or AVX2 ISA support.")
+ message(FATAL_ERROR "vLLM CPU backend requires AVX512, AVX2, or Power9+ ISA support.")
 endif()
 message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}")
diff --git a/cmake/utils.cmake b/cmake/utils.cmake
index 071e16336dfa2..4869cad541135 100644
--- a/cmake/utils.cmake
+++ b/cmake/utils.cmake
@@ -147,19 +147,23 @@ macro(override_gpu_arches GPU_ARCHES GPU_LANG GPU_SUPPORTED_ARCHES)
 if (${GPU_LANG} STREQUAL "HIP")
 #
 # `GPU_ARCHES` controls the `--offload-arch` flags.
- # `CMAKE_HIP_ARCHITECTURES` is set up by torch and can be controlled
- # via the `PYTORCH_ROCM_ARCH` env variable.
 #
-
+ # If the PYTORCH_ROCM_ARCH env variable exists, we take it as a list;
+ # otherwise we use CMAKE_HIP_ARCHITECTURES, which was generated by calling
+ # "rocm_agent_enumerator" in "enable_language(HIP)"
+ # (in file Modules/CMakeDetermineHIPCompiler.cmake).
+ #
+ if(DEFINED ENV{PYTORCH_ROCM_ARCH})
+ set(HIP_ARCHITECTURES $ENV{PYTORCH_ROCM_ARCH})
+ else()
+ set(HIP_ARCHITECTURES ${CMAKE_HIP_ARCHITECTURES})
+ endif()
 #
 # Find the intersection of the supported + detected architectures to
 # set the module architecture flags.
 #
-
- set(VLLM_ROCM_SUPPORTED_ARCHS "gfx908;gfx90a;gfx942;gfx1100")
- set(${GPU_ARCHES})
- foreach (_ARCH ${VLLM_ROCM_SUPPORTED_ARCHS})
+ foreach (_ARCH ${HIP_ARCHITECTURES})
 if (_ARCH IN_LIST _GPU_SUPPORTED_ARCHES_LIST)
 list(APPEND ${GPU_ARCHES} ${_ARCH})
 endif()
@@ -167,7 +171,7 @@ macro(override_gpu_arches GPU_ARCHES GPU_LANG GPU_SUPPORTED_ARCHES)
 if(NOT ${GPU_ARCHES})
 message(FATAL_ERROR
- "None of the detected ROCm architectures: ${CMAKE_HIP_ARCHITECTURES} is"
+ "None of the detected ROCm architectures: ${HIP_ARCHITECTURES} is"
 " supported. Supported ROCm architectures are: ${_GPU_SUPPORTED_ARCHES_LIST}.")
 endif()
diff --git a/csrc/cpu/cpu_types.hpp b/csrc/cpu/cpu_types.hpp
index d7621aaae81c9..0213be09105ed 100644
--- a/csrc/cpu/cpu_types.hpp
+++ b/csrc/cpu/cpu_types.hpp
@@ -2,514 +2,14 @@
 #ifndef CPU_TYPES_HPP
 #define CPU_TYPES_HPP
-#include
-#include
-
-#ifndef __AVX2__
-static_assert(false, "AVX2 must be supported for the current implementation.");
-#endif
-
-namespace vec_op {
-
-// FIXME: FP16 is not fully supported in Torch-CPU
-#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \
- AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \
- AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__)
-
-#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \
- AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))
-
-#ifndef CPU_OP_GUARD
-#define CPU_KERNEL_GUARD_IN(NAME)
-#define CPU_KERNEL_GUARD_OUT(NAME)
-#else
-#define CPU_KERNEL_GUARD_IN(NAME) \
- std::cout << #NAME << " invoked." << std::endl;
-#define CPU_KERNEL_GUARD_OUT(NAME) std::cout << #NAME << " exit."
<< std::endl; -#endif - -#define FORCE_INLINE __attribute__((always_inline)) inline - -namespace { -template -constexpr void unroll_loop_item(std::integer_sequence, F &&f) { - (f(std::integral_constant{}), ...); -} -}; // namespace - -template >> -constexpr void unroll_loop(F &&f) { - unroll_loop_item(std::make_integer_sequence{}, std::forward(f)); -} - -template struct Vec { - constexpr static int get_elem_num() { return T::VEC_ELEM_NUM; } -}; - -struct FP32Vec8; -struct FP32Vec16; - -#ifdef __AVX512FP16__ -struct FP16Vec8 : public Vec { - constexpr static int VEC_ELEM_NUM = 8; - - __m128h reg; - - explicit FP16Vec8(_Float16 v) : reg(_mm_set1_ph(v)) {} - - explicit FP16Vec8(const void *ptr) : reg(_mm_loadu_ph(ptr)) {} - - explicit FP16Vec8(__m128h data) : reg(data) {} - - FP16Vec8 operator*(const FP16Vec8 &b) const { - return FP16Vec8(_mm_mul_ph(reg, b.reg)); - } - - FP16Vec8 operator+(const FP16Vec8 &b) const { - return FP16Vec8(_mm_add_ph(reg, b.reg)); - } - - FP16Vec8 operator-(const FP16Vec8 &b) const { - return FP16Vec8(_mm_sub_ph(reg, b.reg)); - } - - FP16Vec8 operator/(const FP16Vec8 &b) const { - return FP16Vec8(_mm_div_ph(reg, b.reg)); - } - - void save(void *ptr) const { _mm_storeu_ph(ptr, reg); } -}; -#endif - -struct BF16Vec8 : public Vec { - constexpr static int VEC_ELEM_NUM = 8; - - __m128i reg; - - explicit BF16Vec8(const void *ptr) - : reg((__m128i)_mm_loadu_si128((__m128i *)ptr)) {} - - explicit BF16Vec8(const FP32Vec8 &); - - void save(void *ptr) const { *reinterpret_cast<__m128i *>(ptr) = reg; } -}; - -struct BF16Vec16 : public Vec { - constexpr static int VEC_ELEM_NUM = 16; - - __m256i reg; - - explicit BF16Vec16(const void *ptr) - : reg((__m256i)_mm256_loadu_si256((__m256i *)ptr)) {} - - explicit BF16Vec16(const FP32Vec16 &); - - void save(void *ptr) const { *reinterpret_cast<__m256i *>(ptr) = reg; } -}; - -#ifdef __AVX512F__ -struct BF16Vec32 : public Vec { - constexpr static int VEC_ELEM_NUM = 32; - - __m512i reg; - - explicit BF16Vec32(const void *ptr) : reg((__m512i)_mm512_loadu_si512(ptr)) {} - - explicit BF16Vec32(__m512i data) : reg(data) {} - - explicit BF16Vec32(BF16Vec8 &vec8_data) - : reg((__m512i)_mm512_inserti32x4( - _mm512_inserti32x4(_mm512_inserti32x4(_mm512_castsi128_si512( - (__m128i)vec8_data.reg), - (__m128i)vec8_data.reg, 1), - (__m128i)vec8_data.reg, 2), - (__m128i)vec8_data.reg, 3)) {} - - void save(void *ptr) const { *reinterpret_cast<__m512i *>(ptr) = reg; } -}; -#else -struct BF16Vec32 : public Vec { - constexpr static int VEC_ELEM_NUM = 32; - - __m256i reg_low; - __m256i reg_high; - - explicit BF16Vec32(const void *ptr) - : reg_low(_mm256_loadu_si256((__m256i const *)ptr)), - reg_high(_mm256_loadu_si256((__m256i const *)ptr + 1)) {} - - explicit BF16Vec32(__m256i low, __m256i high) : reg_low(low), - reg_high(high) {} - - explicit BF16Vec32(BF16Vec8 &vec8_data) - : reg_low((__m256i)_mm256_inserti32x4( - _mm256_castsi128_si256((__m128i)vec8_data.reg), - (__m128i)vec8_data.reg, 1)), - reg_high((__m256i)_mm256_inserti32x4( - _mm256_castsi128_si256((__m128i)vec8_data.reg), - (__m128i)vec8_data.reg, 1)) {} - - void save(void *ptr) const { - *reinterpret_cast<__m256i *>(ptr) = reg_low; - *reinterpret_cast<__m256i *>((__m256i *)ptr + 1) = reg_high; - } -}; -#endif - -struct FP32Vec4 : public Vec { - constexpr static int VEC_ELEM_NUM = 4; - union AliasReg { - __m128 reg; - float values[VEC_ELEM_NUM]; - }; - - __m128 reg; - - explicit FP32Vec4(float v) : reg(_mm_set1_ps(v)) {} - - explicit FP32Vec4() : reg(_mm_set1_ps(0.0)) {} - - explicit 
FP32Vec4(const float *ptr) : reg(_mm_loadu_ps(ptr)) {} - - explicit FP32Vec4(__m128 data) : reg(data) {} - - explicit FP32Vec4(const FP32Vec4 &data) : reg(data.reg) {} -}; - -struct FP32Vec8 : public Vec { - constexpr static int VEC_ELEM_NUM = 8; - union AliasReg { - __m256 reg; - float values[VEC_ELEM_NUM]; - }; - - __m256 reg; - - explicit FP32Vec8(float v) : reg(_mm256_set1_ps(v)) {} - - explicit FP32Vec8() : reg(_mm256_set1_ps(0.0)) {} - - explicit FP32Vec8(const float *ptr) : reg(_mm256_loadu_ps(ptr)) {} - - explicit FP32Vec8(__m256 data) : reg(data) {} - - explicit FP32Vec8(const FP32Vec8 &data) : reg(data.reg) {} - -#ifdef __AVX512FP16__ - explicit FP32Vec8(__m128h v) : reg(_mm256_cvtph_ps(_mm_castph_si128(v))) {} -#endif - - explicit FP32Vec8(const BF16Vec8 &v) - : reg(_mm256_castsi256_ps( - _mm256_bslli_epi128(_mm256_cvtepu16_epi32(v.reg), 2))) {} - - float reduce_sum() const { - AliasReg ar; - ar.reg = reg; - float result = 0; - unroll_loop([&result, &ar](int i) { result += ar.values[i]; }); - - return result; - } - - FP32Vec8 exp() const { - AliasReg ar; - ar.reg = reg; - return FP32Vec8(_mm256_set_ps(expf(ar.values[7]), expf(ar.values[6]), - expf(ar.values[5]), expf(ar.values[4]), - expf(ar.values[3]), expf(ar.values[2]), - expf(ar.values[1]), expf(ar.values[0]))); - } - - FP32Vec8 tanh() const { - AliasReg ar; - ar.reg = reg; - return FP32Vec8(_mm256_set_ps(tanhf(ar.values[7]), tanhf(ar.values[6]), - tanhf(ar.values[5]), tanhf(ar.values[4]), - tanhf(ar.values[3]), tanhf(ar.values[2]), - tanhf(ar.values[1]), tanhf(ar.values[0]))); - } - - FP32Vec8 er() const { - AliasReg ar; - ar.reg = reg; - return FP32Vec8(_mm256_set_ps(erf(ar.values[7]), erf(ar.values[6]), - erf(ar.values[5]), erf(ar.values[4]), - erf(ar.values[3]), erf(ar.values[2]), - erf(ar.values[1]), erf(ar.values[0]))); - } - - FP32Vec8 operator*(const FP32Vec8 &b) const { - return FP32Vec8(_mm256_mul_ps(reg, b.reg)); - } - - FP32Vec8 operator+(const FP32Vec8 &b) const { - return FP32Vec8(_mm256_add_ps(reg, b.reg)); - } - - FP32Vec8 operator-(const FP32Vec8 &b) const { - return FP32Vec8(_mm256_sub_ps(reg, b.reg)); - } - - FP32Vec8 operator/(const FP32Vec8 &b) const { - return FP32Vec8(_mm256_div_ps(reg, b.reg)); - } - - void save(float *ptr) const { _mm256_storeu_ps(ptr, reg); } -}; - -#ifdef __AVX512F__ -struct FP32Vec16 : public Vec { - constexpr static int VEC_ELEM_NUM = 16; - union AliasReg { - __m512 reg; - float values[VEC_ELEM_NUM]; - }; - - __m512 reg; - - explicit FP32Vec16(float v) : reg(_mm512_set1_ps(v)) {} - - explicit FP32Vec16() : reg(_mm512_set1_ps(0.0)) {} - - explicit FP32Vec16(const float *ptr) : reg(_mm512_loadu_ps(ptr)) {} - - explicit FP32Vec16(__m512 data) : reg(data) {} - - explicit FP32Vec16(const FP32Vec16 &data) : reg(data.reg) {} - - explicit FP32Vec16(const FP32Vec4 &data) - : reg((__m512)_mm512_inserti32x4( - _mm512_inserti32x4( - _mm512_inserti32x4(_mm512_castsi128_si512((__m128i)data.reg), - (__m128i)data.reg, 1), - (__m128i)data.reg, 2), - (__m128i)data.reg, 3)) {} - - explicit FP32Vec16(const FP32Vec8 &data) - : reg((__m512)_mm512_inserti32x8( - _mm512_castsi256_si512((__m256i)data.reg), (__m256i)data.reg, 1)) {} - - explicit FP32Vec16(const BF16Vec16 &v) - : reg(_mm512_castsi512_ps( - _mm512_bslli_epi128(_mm512_cvtepu16_epi32(v.reg), 2))) {} - - explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {} - - FP32Vec16 operator*(const FP32Vec16 &b) const { - return FP32Vec16(_mm512_mul_ps(reg, b.reg)); - } - - FP32Vec16 operator+(const FP32Vec16 &b) const { - return 
FP32Vec16(_mm512_add_ps(reg, b.reg)); - } - - FP32Vec16 operator-(const FP32Vec16 &b) const { - return FP32Vec16(_mm512_sub_ps(reg, b.reg)); - } - - FP32Vec16 operator/(const FP32Vec16 &b) const { - return FP32Vec16(_mm512_div_ps(reg, b.reg)); - } - - float reduce_sum() const { return _mm512_reduce_add_ps(reg); } - - template float reduce_sub_sum(int idx) { - static_assert(VEC_ELEM_NUM % group_size == 0); - constexpr uint32_t base_mask = (0xFFFF >> (16 - group_size)); - __mmask16 mask = _cvtu32_mask16(base_mask << (idx * group_size)); - return _mm512_mask_reduce_add_ps(mask, reg); - } - - void save(float *ptr) const { _mm512_storeu_ps(ptr, reg); } -}; +#if defined(__x86_64__) + //x86 implementation + #include "cpu_types_x86.hpp" +#elif defined(__POWER9_VECTOR__) + //ppc implementation + #include "cpu_types_vsx.hpp" #else -struct FP32Vec16 : public Vec { - constexpr static int VEC_ELEM_NUM = 16; - - union AliasReg { - __m256 reg; - float values[8]; - }; - - __m256 reg_low; - __m256 reg_high; - - explicit FP32Vec16(float v) : reg_low(_mm256_set1_ps(v)), - reg_high(_mm256_set1_ps(v)) {} - - explicit FP32Vec16() : reg_low(_mm256_set1_ps(0.0)), - reg_high(_mm256_set1_ps(0.0)) {} - - explicit FP32Vec16(const float *ptr) : reg_low(_mm256_loadu_ps(ptr)), - reg_high(_mm256_loadu_ps(ptr + 8)) {} - - explicit FP32Vec16(__m256 low, __m256 high) : reg_low(low), reg_high(high) {} - - explicit FP32Vec16(const FP32Vec16 &data) : reg_low(data.reg_low), - reg_high(data.reg_high) {} - - explicit FP32Vec16(const FP32Vec4 &data) - : reg_low((__m256)_mm256_inserti128_si256( - _mm256_castsi128_si256((__m128i)data.reg), - (__m128i)data.reg, 1)), - reg_high((__m256)_mm256_inserti128_si256( - _mm256_castsi128_si256((__m128i)data.reg), - (__m128i)data.reg, 1)) {} - - explicit FP32Vec16(const FP32Vec8 &data) - : reg_low(data.reg), reg_high(data.reg) {} - - explicit FP32Vec16(const BF16Vec16 &v) { - __m128i low = _mm256_extractf128_si256(v.reg, 0); - __m128i high = _mm256_extractf128_si256(v.reg, 1); - - __m256i v_low_epi32 = _mm256_cvtepu16_epi32(low); - __m256i v_high_epi32 = _mm256_cvtepu16_epi32(high); - - __m256i v_low_shifted = _mm256_bslli_epi128(v_low_epi32, 2); - __m256i v_high_shifted = _mm256_bslli_epi128(v_high_epi32, 2); - - reg_low = _mm256_castsi256_ps(v_low_shifted); - reg_high = _mm256_castsi256_ps(v_high_shifted); - } - - explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {} - - FP32Vec16 operator*(const FP32Vec16 &b) const { - return FP32Vec16(_mm256_mul_ps(reg_low, b.reg_low), - _mm256_mul_ps(reg_high, b.reg_high)); - } - - FP32Vec16 operator+(const FP32Vec16 &b) const { - return FP32Vec16(_mm256_add_ps(reg_low, b.reg_low), - _mm256_add_ps(reg_high, b.reg_high)); - } - - FP32Vec16 operator-(const FP32Vec16 &b) const { - return FP32Vec16(_mm256_sub_ps(reg_low, b.reg_low), - _mm256_sub_ps(reg_high, b.reg_high)); - } - - FP32Vec16 operator/(const FP32Vec16 &b) const { - return FP32Vec16(_mm256_div_ps(reg_low, b.reg_low), - _mm256_div_ps(reg_high, b.reg_high)); - } - - float reduce_sum() const { - FP32Vec8 low = FP32Vec8(reg_low); - FP32Vec8 high = FP32Vec8(reg_high); - return low.reduce_sum() + high.reduce_sum(); - } - - template float reduce_sub_sum(int idx) { - float sum = 0.0; - static_assert(VEC_ELEM_NUM % group_size == 0); - constexpr uint32_t base_mask = (0xFFFF >> (16 - group_size)); - uint32_t mask = base_mask << (idx * group_size); - - AliasReg ar; - - auto func = [&sum, &mask, &ar](int i) { - int flag = mask & 0x1; - mask = mask >> 1; - if (flag != 0) sum += ar.values[i]; - }; - 
- ar.reg = reg_low; - unroll_loop(func); - - ar.reg = reg_high; - unroll_loop(func); - - return sum; - } - - void save(float *ptr) const { - _mm256_storeu_ps(ptr, reg_low); - _mm256_storeu_ps(ptr + 8, reg_high); - } -}; -#endif - -template struct VecType { using vec_type = void; }; - -template using vec_t = typename VecType::vec_type; - -template <> struct VecType { using vec_type = FP32Vec8; }; - -#ifdef __AVX512FP16__ -template <> struct VecType { using vec_type = FP16Vec16; }; + #warning "unsupported vLLM cpu implementation" #endif -template <> struct VecType { using vec_type = BF16Vec8; }; - -template void storeFP32(float v, T *ptr) { *ptr = v; } - -#ifdef __AVX512FP16__ -template <> inline void storeFP32(float v, c10::Half *ptr) { - *reinterpret_cast<_Float16 *>(ptr) = v; -} -#endif - -inline void fma(FP32Vec16 &acc, FP32Vec16 &a, FP32Vec16 &b) { - acc = acc + a * b; -} - -#ifdef __AVX512BF16__ -template <> inline void storeFP32(float v, c10::BFloat16 *ptr) { - *reinterpret_cast<__bfloat16 *>(ptr) = _mm_cvtness_sbh(v); -} - -inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) - : reg((__m128i)_mm256_cvtneps_pbh(v.reg)) {} - -inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) - : reg((__m256i)_mm512_cvtneps_pbh(v.reg)) {} - -inline void fma(FP32Vec16 &acc, BF16Vec32 &a, BF16Vec32 &b) { - acc.reg = _mm512_dpbf16_ps(acc.reg, (__m512bh)a.reg, (__m512bh)b.reg); -} -#else -template <> inline void storeFP32(float v, c10::BFloat16 *ptr) { - c10::BFloat16 __attribute__((__may_alias__)) *v_ptr = - reinterpret_cast(&v); - *ptr = *(v_ptr + 1); -} - -#ifdef __AVX512F__ -inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) - : reg(_mm256_cvtepi32_epi16( - _mm256_bsrli_epi128(_mm256_castps_si256(v.reg), 2))) {} - -inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) - : reg(_mm512_cvtepi32_epi16( - _mm512_bsrli_epi128(_mm512_castps_si512(v.reg), 2))) {} -#else -namespace{ -__m128i FP32Vec8_to_BF16Vec8_avx2(__m256 a) { - __m256i ai = _mm256_castps_si256(a); - ai = _mm256_srli_epi32(ai, 16); - ai = _mm256_packus_epi32(ai, ai); - ai = _mm256_permute4x64_epi64(ai, 0b00111001); - return _mm256_extracti128_si256(ai, 0); -} -} - -inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) - : reg(FP32Vec8_to_BF16Vec8_avx2(v.reg)) {} - -inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) { - BF16Vec8 low = BF16Vec8(FP32Vec8(v.reg_low)); - BF16Vec8 high = BF16Vec8(FP32Vec8(v.reg_high)); - reg = _mm256_insertf128_si256(_mm256_castsi128_si256(low.reg), high.reg, 1); -} -#endif // __AVX512F__ -#endif // __AVX512BF16__ - -inline void prefetch(const void *addr) { _mm_prefetch(addr, _MM_HINT_T1); } - -}; // namespace vec_op - #endif diff --git a/csrc/cpu/cpu_types_vsx.hpp b/csrc/cpu/cpu_types_vsx.hpp new file mode 100644 index 0000000000000..b50bdadc5713d --- /dev/null +++ b/csrc/cpu/cpu_types_vsx.hpp @@ -0,0 +1,491 @@ + +#ifndef CPU_TYPES_VSX_HPP +#define CPU_TYPES_VSX_HPP + +#include +#include +#include + +namespace vec_op { + +// FIXME: FP16 is not fully supported in Torch-CPU +#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ + AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) + +#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \ + AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)) + +#ifndef CPU_OP_GUARD +#define CPU_KERNEL_GUARD_IN(NAME) +#define CPU_KERNEL_GUARD_OUT(NAME) +#else +#define CPU_KERNEL_GUARD_IN(NAME) \ + std::cout << #NAME << " invoked." << std::endl; +#define CPU_KERNEL_GUARD_OUT(NAME) std::cout << #NAME << " exit." 
<< std::endl; +#endif + +#define FORCE_INLINE __attribute__((always_inline)) inline + +namespace { +template +constexpr void unroll_loop_item(std::integer_sequence, F &&f) { + (f(std::integral_constant{}), ...); +} +}; // namespace + +template >> +constexpr void unroll_loop(F &&f) { + unroll_loop_item(std::make_integer_sequence{}, std::forward(f)); +} + +template struct Vec { + constexpr static int get_elem_num() { return T::VEC_ELEM_NUM; } +}; + +typedef struct ss16x8x2_t { + __vector signed short val[2]; +} ss16x8x2_t; + +typedef struct ss16x8x4_t { + __vector signed short val[4]; +} ss16x8x4_t; + +typedef struct f32x4x2_t { + __vector float val[2]; +} f32x4x2_t; + +typedef struct f32x4x4_t { + __vector float val[4]; +} f32x4x4_t; + +struct FP32Vec8; +struct FP32Vec16; + +struct BF16Vec8 : public Vec { + constexpr static int VEC_ELEM_NUM = 8; + + __vector signed short reg; + + explicit BF16Vec8(const void *ptr) + : reg((__vector signed short)vec_xl(0, (__vector signed short *)ptr)) {} + + explicit BF16Vec8(const FP32Vec8 &); + + void save(void *ptr) const { *reinterpret_cast<__vector signed short *>(ptr) = reg; } +}; + +struct BF16Vec16 : public Vec { + constexpr static int VEC_ELEM_NUM = 16; + + ss16x8x2_t reg; + + explicit BF16Vec16(const void *ptr) { + // Load 256 bits in two parts + reg.val[0] = (__vector signed short)vec_xl(0, (signed short *)ptr); + reg.val[1] = (__vector signed short)vec_xl(16, (signed short *)ptr); + } + + explicit BF16Vec16(const FP32Vec16 &); + + void save(void *ptr) const { + // Save 256 bits in two parts + vec_xst(reg.val[0], 0, (signed short *)ptr); + vec_xst(reg.val[1], 16, (signed short *)ptr); + } +}; + +const static __vector signed short zero = vec_splats((signed short)0); + +struct BF16Vec32 : public Vec { + constexpr static int VEC_ELEM_NUM = 32; + + ss16x8x4_t reg; + explicit BF16Vec32(const void *ptr) + : reg(*reinterpret_cast(ptr)) {} + + explicit BF16Vec32(ss16x8x4_t data) : reg(data) {} + + explicit BF16Vec32(const BF16Vec8 &vec8_data) : reg({ + vec8_data.reg, + vec8_data.reg, + vec8_data.reg, + vec8_data.reg + }) {} + + void save(void *ptr) const { *reinterpret_cast(ptr) = reg; } +}; + +struct FP32Vec4 : public Vec { + constexpr static int VEC_ELEM_NUM = 4; + union AliasReg { + __vector float reg; + float values[VEC_ELEM_NUM]; + }; + + __vector float reg; + + explicit FP32Vec4(float v) : reg(vec_splats(v)) {} + + explicit FP32Vec4() : reg(vec_splats(0.0f)) {} + + explicit FP32Vec4(const float *ptr) : reg(vec_xl(0, ptr)) {} + + explicit FP32Vec4(__vector float data) : reg(data) {} + + explicit FP32Vec4(const FP32Vec4 &data) : reg(data.reg) {} +}; + +struct FP32Vec8 : public Vec { + constexpr static int VEC_ELEM_NUM = 8; + union AliasReg { + f32x4x2_t reg; + float values[VEC_ELEM_NUM]; + }; + + f32x4x2_t reg; + + explicit FP32Vec8(float v) { + reg.val[0] = vec_splats(v); + reg.val[1] = vec_splats(v); + } + + explicit FP32Vec8() { + reg.val[0] = vec_splats(0.0f); + reg.val[1] = vec_splats(0.0f); + } + + explicit FP32Vec8(const float *ptr) { + reg.val[0] = vec_xl(0, ptr); + reg.val[1] = vec_xl(16, ptr); + } + + explicit FP32Vec8(f32x4x2_t data) : reg(data) {} + + explicit FP32Vec8(const FP32Vec8 &data) { + reg.val[0] = data.reg.val[0]; + reg.val[1] = data.reg.val[1]; + } + + explicit FP32Vec8(const BF16Vec8 &v) { + reg.val[0] = (__vector float)vec_mergeh(zero, v.reg); + reg.val[1] = (__vector float)vec_mergel(zero, v.reg); + } + + float reduce_sum() const { + AliasReg ar; + ar.reg = reg; + float result = 0; + unroll_loop([&result, &ar](int i) { 
result += ar.values[i]; }); + + return result; + } + + FP32Vec8 exp() const { + // TODO: Vectorize this + AliasReg ar; + ar.reg = reg; + f32x4x4_t ret; + ret.val[0][0] = std::exp(ar.values[0]); + ret.val[0][1] = std::exp(ar.values[1]); + ret.val[0][2] = std::exp(ar.values[2]); + ret.val[0][3] = std::exp(ar.values[3]); + ret.val[1][0] = std::exp(ar.values[4]); + ret.val[1][1] = std::exp(ar.values[5]); + ret.val[1][2] = std::exp(ar.values[6]); + ret.val[1][3] = std::exp(ar.values[7]); + return FP32Vec8(f32x4x2_t({ret.val[0], ret.val[1]})); + } + + FP32Vec8 tanh() const { + // TODO: Vectorize this + AliasReg ar; + ar.reg = reg; + f32x4x4_t ret; + ret.val[0][0] = std::tanh(ar.values[0]); + ret.val[0][1] = std::tanh(ar.values[1]); + ret.val[0][2] = std::tanh(ar.values[2]); + ret.val[0][3] = std::tanh(ar.values[3]); + ret.val[1][0] = std::tanh(ar.values[4]); + ret.val[1][1] = std::tanh(ar.values[5]); + ret.val[1][2] = std::tanh(ar.values[6]); + ret.val[1][3] = std::tanh(ar.values[7]); + return FP32Vec8(f32x4x2_t({ret.val[0], ret.val[1]})); + } + + FP32Vec8 er() const { + // TODO: Vectorize this + AliasReg ar; + ar.reg = reg; + f32x4x4_t ret; + ret.val[0][0] = std::erf(ar.values[0]); + ret.val[0][1] = std::erf(ar.values[1]); + ret.val[0][2] = std::erf(ar.values[2]); + ret.val[0][3] = std::erf(ar.values[3]); + ret.val[1][0] = std::erf(ar.values[4]); + ret.val[1][1] = std::erf(ar.values[5]); + ret.val[1][2] = std::erf(ar.values[6]); + ret.val[1][3] = std::erf(ar.values[7]); + return FP32Vec8(f32x4x2_t({ret.val[0], ret.val[1]})); + } + + FP32Vec8 operator*(const FP32Vec8 &b) const { + return FP32Vec8({vec_mul(reg.val[0], b.reg.val[0]), vec_mul(reg.val[1], b.reg.val[1])}); + } + + FP32Vec8 operator+(const FP32Vec8 &b) const { + return FP32Vec8({vec_add(reg.val[0], b.reg.val[0]), vec_add(reg.val[1], b.reg.val[1])}); + } + + FP32Vec8 operator-(const FP32Vec8 &b) const { + return FP32Vec8({vec_sub(reg.val[0], b.reg.val[0]), vec_sub(reg.val[1], b.reg.val[1])}); + } + + FP32Vec8 operator/(const FP32Vec8 &b) const { + return FP32Vec8({vec_div(reg.val[0], b.reg.val[0]), vec_div(reg.val[1], b.reg.val[1])}); + } + + void save(float *ptr) const { + vec_xst(reg.val[0], 0, ptr); + vec_xst(reg.val[1], 16, ptr); + } +}; + +struct FP32Vec16 : public Vec { + constexpr static int VEC_ELEM_NUM = 16; + union AliasReg { + f32x4x4_t reg; + float values[VEC_ELEM_NUM]; + }; + + f32x4x4_t reg; + + explicit FP32Vec16(float v) { + reg.val[0] = vec_splats(v); + reg.val[1] = vec_splats(v); + reg.val[2] = vec_splats(v); + reg.val[3] = vec_splats(v); + } + + explicit FP32Vec16() { + reg.val[0] = vec_splats(0.0f); + reg.val[1] = vec_splats(0.0f); + reg.val[2] = vec_splats(0.0f); + reg.val[3] = vec_splats(0.0f); + } + + explicit FP32Vec16(const float *ptr) { + reg.val[0] = vec_xl(0, ptr); + reg.val[1] = vec_xl(16, ptr); + reg.val[2] = vec_xl(32, ptr); + reg.val[3] = vec_xl(48, ptr); + } + + explicit FP32Vec16(f32x4x4_t data) : reg(data) {} + + explicit FP32Vec16(const FP32Vec16 &data) { + reg.val[0] = data.reg.val[0]; + reg.val[1] = data.reg.val[1]; + reg.val[2] = data.reg.val[2]; + reg.val[3] = data.reg.val[3]; + } + + explicit FP32Vec16(const FP32Vec4 &data) { + reg.val[0] = data.reg; + reg.val[1] = data.reg; + reg.val[2] = data.reg; + reg.val[3] = data.reg; + } + + explicit FP32Vec16(const FP32Vec8 &data) { + reg.val[0] = data.reg.val[0]; + reg.val[1] = data.reg.val[1]; + reg.val[2] = data.reg.val[0]; + reg.val[3] = data.reg.val[1]; + } + + explicit FP32Vec16(const BF16Vec16 &v) { + reg.val[0] = (__vector float)vec_mergeh(zero, 
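+ // (Interleaving a zero half-word with each bf16 element is the whole
+ //  bf16 -> fp32 widening trick: the bf16 bits become the upper 16 bits
+ //  of the float and the lower 16 bits are zero.)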
v.reg.val[0]); + reg.val[1] = (__vector float)vec_mergel(zero, v.reg.val[0]); + reg.val[2] = (__vector float)vec_mergeh(zero, v.reg.val[1]); + reg.val[3] = (__vector float)vec_mergel(zero, v.reg.val[1]); + } + + explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {} + + FP32Vec16 operator*(const FP32Vec16 &b) const { + return FP32Vec16(f32x4x4_t({ + vec_mul(reg.val[0], b.reg.val[0]), + vec_mul(reg.val[1], b.reg.val[1]), + vec_mul(reg.val[2], b.reg.val[2]), + vec_mul(reg.val[3], b.reg.val[3])})); + } + + FP32Vec16 operator+(const FP32Vec16 &b) const { + return FP32Vec16(f32x4x4_t({ + vec_add(reg.val[0], b.reg.val[0]), + vec_add(reg.val[1], b.reg.val[1]), + vec_add(reg.val[2], b.reg.val[2]), + vec_add(reg.val[3], b.reg.val[3])})); + } + + FP32Vec16 operator-(const FP32Vec16 &b) const { + return FP32Vec16(f32x4x4_t({ + vec_sub(reg.val[0], b.reg.val[0]), + vec_sub(reg.val[1], b.reg.val[1]), + vec_sub(reg.val[2], b.reg.val[2]), + vec_sub(reg.val[3], b.reg.val[3])})); + } + + FP32Vec16 operator/(const FP32Vec16 &b) const { + return FP32Vec16(f32x4x4_t({ + vec_div(reg.val[0], b.reg.val[0]), + vec_div(reg.val[1], b.reg.val[1]), + vec_div(reg.val[2], b.reg.val[2]), + vec_div(reg.val[3], b.reg.val[3])})); + } + + float reduce_sum() const { + AliasReg ar; + ar.reg = reg; + float result = 0; + unroll_loop([&result, &ar](int i) { result += ar.values[i]; }); + + return result; + } + + template float reduce_sub_sum(int idx) { + static_assert(VEC_ELEM_NUM % group_size == 0); + + AliasReg ar; + ar.reg = reg; + float result = 0; + const int start = idx * group_size; + unroll_loop( + [&result, &start, ar](int i) { result += ar.values[start + i]; }); + + return result; + } + + void save(float *ptr) const { + vec_xst(reg.val[0], 0, ptr); + vec_xst(reg.val[1], 16, ptr); + vec_xst(reg.val[2], 32, ptr); + vec_xst(reg.val[3], 48, ptr); + } +}; + +template struct VecType { using vec_type = void; }; + +template using vec_t = typename VecType::vec_type; + +template <> struct VecType { using vec_type = FP32Vec8; }; + +template <> struct VecType { using vec_type = BF16Vec8; }; + +template void storeFP32(float v, T *ptr) { *ptr = v; } + +inline void fma(FP32Vec16 &acc, FP32Vec16 &a, FP32Vec16 &b) { + acc = acc + a * b; +} + +template <> inline void storeFP32(float v, c10::BFloat16 *ptr) { + c10::BFloat16 __attribute__((__may_alias__)) *v_ptr = + reinterpret_cast(&v); + *ptr = *(v_ptr + 1); +} + +#ifndef __VEC_CLASS_FP_NAN +#define __VEC_CLASS_FP_NAN (1 << 6) +#endif + +const static __vector unsigned char omask = { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 }; +#ifndef _ARCH_PWR10 +const static __vector unsigned int bias = { 0x00007fff, 0x00007fff, 0x00007fff, 0x00007fff }; +const static __vector unsigned int nan = { 0x7fc00000, 0x7fc00000, 0x7fc00000, 0x7fc00000 }; +const static __vector unsigned int sh16 = { 16, 16, 16, 16 }; +const static __vector unsigned int one = { 1, 1, 1, 1 }; +#endif + +inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) { +#ifdef _ARCH_PWR10 + __vector signed short ret[2]; + ret[0] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[0]); + ret[1] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[1]); + reg = vec_perm(ret[0], ret[1], omask); +#elif defined(_ARCH_PWR9) + __vector unsigned int inp0 = (__vector unsigned int)(v.reg.val[0]); + __vector unsigned int inp1 = (__vector unsigned int)(v.reg.val[1]); + __vector unsigned int lsb0 = vec_sr(inp0, sh16); + __vector unsigned int lsb1 = vec_sr(inp1, sh16); + lsb0 
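+ // (Software round-to-nearest-even: bit 16 of each fp32 word, i.e. the
+ //  would-be bf16 LSB, is added to the 0x7fff bias, the sum is added to the
+ //  raw bits, NaN inputs are forced to a quiet-NaN pattern, and the results
+ //  are shifted right by 16 and packed with vec_perm into bf16 lanes.)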
= vec_and(lsb0, one); + lsb1 = vec_and(lsb1, one); + __vector unsigned int rnd0 = vec_add(lsb0, bias); + __vector unsigned int rnd1 = vec_add(lsb1, bias); + inp0 = vec_add(inp0, rnd0); + inp1 = vec_add(inp1, rnd1); + __vector __bool int sel0 = vec_test_data_class(v.reg.val[0], __VEC_CLASS_FP_NAN); + __vector __bool int sel1 = vec_test_data_class(v.reg.val[1], __VEC_CLASS_FP_NAN); + inp0 = vec_sel(inp0, nan, sel0); + inp1 = vec_sel(inp1, nan, sel1); + inp0 = vec_sr(inp0, sh16); + inp1 = vec_sr(inp1, sh16); + reg = (__vector signed short)vec_perm(inp0, inp1, omask); +#endif +} + +inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) { +#ifdef _ARCH_PWR10 + __vector signed short ret[4]; + ret[0] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[0]); + ret[1] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[1]); + ret[2] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[2]); + ret[3] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[3]); + reg.val[0] = vec_perm(ret[0], ret[1], omask); + reg.val[1] = vec_perm(ret[2], ret[3], omask); +#elif defined(_ARCH_PWR9) + __vector unsigned int inp0 = (__vector unsigned int)(v.reg.val[0]); + __vector unsigned int inp1 = (__vector unsigned int)(v.reg.val[1]); + __vector unsigned int inp2 = (__vector unsigned int)(v.reg.val[2]); + __vector unsigned int inp3 = (__vector unsigned int)(v.reg.val[3]); + __vector unsigned int lsb0 = vec_sr(inp0, sh16); + __vector unsigned int lsb1 = vec_sr(inp1, sh16); + __vector unsigned int lsb2 = vec_sr(inp2, sh16); + __vector unsigned int lsb3 = vec_sr(inp3, sh16); + lsb0 = vec_and(lsb0, one); + lsb1 = vec_and(lsb1, one); + lsb2 = vec_and(lsb2, one); + lsb3 = vec_and(lsb3, one); + __vector unsigned int rnd0 = vec_add(lsb0, bias); + __vector unsigned int rnd1 = vec_add(lsb1, bias); + __vector unsigned int rnd2 = vec_add(lsb2, bias); + __vector unsigned int rnd3 = vec_add(lsb3, bias); + inp0 = vec_add(inp0, rnd0); + inp1 = vec_add(inp1, rnd1); + inp2 = vec_add(inp2, rnd2); + inp3 = vec_add(inp3, rnd3); + __vector __bool int sel0 = vec_test_data_class(v.reg.val[0], __VEC_CLASS_FP_NAN); + __vector __bool int sel1 = vec_test_data_class(v.reg.val[1], __VEC_CLASS_FP_NAN); + __vector __bool int sel2 = vec_test_data_class(v.reg.val[2], __VEC_CLASS_FP_NAN); + __vector __bool int sel3 = vec_test_data_class(v.reg.val[3], __VEC_CLASS_FP_NAN); + inp0 = vec_sel(inp0, nan, sel0); + inp1 = vec_sel(inp1, nan, sel1); + inp2 = vec_sel(inp2, nan, sel2); + inp3 = vec_sel(inp3, nan, sel3); + inp0 = vec_sr(inp0, sh16); + inp1 = vec_sr(inp1, sh16); + inp2 = vec_sr(inp2, sh16); + inp3 = vec_sr(inp3, sh16); + reg.val[0] = (__vector signed short)vec_perm(inp0, inp1, omask); + reg.val[1] = (__vector signed short)vec_perm(inp2, inp3, omask); +#endif +} + +inline void prefetch(const void *addr) { + __asm__ __volatile__("dcbt 0, %0" : : "r"(addr) : "memory"); +} + +}; // namespace vec_op + +#endif diff --git a/csrc/cpu/cpu_types_x86.hpp b/csrc/cpu/cpu_types_x86.hpp new file mode 100644 index 0000000000000..f50620a5287d4 --- /dev/null +++ b/csrc/cpu/cpu_types_x86.hpp @@ -0,0 +1,515 @@ + +#ifndef CPU_TYPES_X86_HPP +#define CPU_TYPES_X86_HPP + +#include +#include + +#ifndef __AVX2__ +static_assert(false, "AVX2 must be supported for the current implementation."); +#endif + +namespace vec_op { + +// FIXME: FP16 is not fully supported in Torch-CPU +#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) 
\ + AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) + +#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \ + AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)) + +#ifndef CPU_OP_GUARD +#define CPU_KERNEL_GUARD_IN(NAME) +#define CPU_KERNEL_GUARD_OUT(NAME) +#else +#define CPU_KERNEL_GUARD_IN(NAME) \ + std::cout << #NAME << " invoked." << std::endl; +#define CPU_KERNEL_GUARD_OUT(NAME) std::cout << #NAME << " exit." << std::endl; +#endif + +#define FORCE_INLINE __attribute__((always_inline)) inline + +namespace { +template +constexpr void unroll_loop_item(std::integer_sequence, F &&f) { + (f(std::integral_constant{}), ...); +} +}; // namespace + +template >> +constexpr void unroll_loop(F &&f) { + unroll_loop_item(std::make_integer_sequence{}, std::forward(f)); +} + +template struct Vec { + constexpr static int get_elem_num() { return T::VEC_ELEM_NUM; } +}; + +struct FP32Vec8; +struct FP32Vec16; + +#ifdef __AVX512FP16__ +struct FP16Vec8 : public Vec { + constexpr static int VEC_ELEM_NUM = 8; + + __m128h reg; + + explicit FP16Vec8(_Float16 v) : reg(_mm_set1_ph(v)) {} + + explicit FP16Vec8(const void *ptr) : reg(_mm_loadu_ph(ptr)) {} + + explicit FP16Vec8(__m128h data) : reg(data) {} + + FP16Vec8 operator*(const FP16Vec8 &b) const { + return FP16Vec8(_mm_mul_ph(reg, b.reg)); + } + + FP16Vec8 operator+(const FP16Vec8 &b) const { + return FP16Vec8(_mm_add_ph(reg, b.reg)); + } + + FP16Vec8 operator-(const FP16Vec8 &b) const { + return FP16Vec8(_mm_sub_ph(reg, b.reg)); + } + + FP16Vec8 operator/(const FP16Vec8 &b) const { + return FP16Vec8(_mm_div_ph(reg, b.reg)); + } + + void save(void *ptr) const { _mm_storeu_ph(ptr, reg); } +}; +#endif + +struct BF16Vec8 : public Vec { + constexpr static int VEC_ELEM_NUM = 8; + + __m128i reg; + + explicit BF16Vec8(const void *ptr) + : reg((__m128i)_mm_loadu_si128((__m128i *)ptr)) {} + + explicit BF16Vec8(const FP32Vec8 &); + + void save(void *ptr) const { *reinterpret_cast<__m128i *>(ptr) = reg; } +}; + +struct BF16Vec16 : public Vec { + constexpr static int VEC_ELEM_NUM = 16; + + __m256i reg; + + explicit BF16Vec16(const void *ptr) + : reg((__m256i)_mm256_loadu_si256((__m256i *)ptr)) {} + + explicit BF16Vec16(const FP32Vec16 &); + + void save(void *ptr) const { *reinterpret_cast<__m256i *>(ptr) = reg; } +}; + +#ifdef __AVX512F__ +struct BF16Vec32 : public Vec { + constexpr static int VEC_ELEM_NUM = 32; + + __m512i reg; + + explicit BF16Vec32(const void *ptr) : reg((__m512i)_mm512_loadu_si512(ptr)) {} + + explicit BF16Vec32(__m512i data) : reg(data) {} + + explicit BF16Vec32(BF16Vec8 &vec8_data) + : reg((__m512i)_mm512_inserti32x4( + _mm512_inserti32x4(_mm512_inserti32x4(_mm512_castsi128_si512( + (__m128i)vec8_data.reg), + (__m128i)vec8_data.reg, 1), + (__m128i)vec8_data.reg, 2), + (__m128i)vec8_data.reg, 3)) {} + + void save(void *ptr) const { *reinterpret_cast<__m512i *>(ptr) = reg; } +}; +#else +struct BF16Vec32 : public Vec { + constexpr static int VEC_ELEM_NUM = 32; + + __m256i reg_low; + __m256i reg_high; + + explicit BF16Vec32(const void *ptr) + : reg_low(_mm256_loadu_si256((__m256i const *)ptr)), + reg_high(_mm256_loadu_si256((__m256i const *)ptr + 1)) {} + + explicit BF16Vec32(__m256i low, __m256i high) : reg_low(low), + reg_high(high) {} + + explicit BF16Vec32(BF16Vec8 &vec8_data) + : reg_low((__m256i)_mm256_inserti32x4( + _mm256_castsi128_si256((__m128i)vec8_data.reg), + (__m128i)vec8_data.reg, 1)), + reg_high((__m256i)_mm256_inserti32x4( + 
_mm256_castsi128_si256((__m128i)vec8_data.reg), + (__m128i)vec8_data.reg, 1)) {} + + void save(void *ptr) const { + *reinterpret_cast<__m256i *>(ptr) = reg_low; + *reinterpret_cast<__m256i *>((__m256i *)ptr + 1) = reg_high; + } +}; +#endif + +struct FP32Vec4 : public Vec { + constexpr static int VEC_ELEM_NUM = 4; + union AliasReg { + __m128 reg; + float values[VEC_ELEM_NUM]; + }; + + __m128 reg; + + explicit FP32Vec4(float v) : reg(_mm_set1_ps(v)) {} + + explicit FP32Vec4() : reg(_mm_set1_ps(0.0)) {} + + explicit FP32Vec4(const float *ptr) : reg(_mm_loadu_ps(ptr)) {} + + explicit FP32Vec4(__m128 data) : reg(data) {} + + explicit FP32Vec4(const FP32Vec4 &data) : reg(data.reg) {} +}; + +struct FP32Vec8 : public Vec { + constexpr static int VEC_ELEM_NUM = 8; + union AliasReg { + __m256 reg; + float values[VEC_ELEM_NUM]; + }; + + __m256 reg; + + explicit FP32Vec8(float v) : reg(_mm256_set1_ps(v)) {} + + explicit FP32Vec8() : reg(_mm256_set1_ps(0.0)) {} + + explicit FP32Vec8(const float *ptr) : reg(_mm256_loadu_ps(ptr)) {} + + explicit FP32Vec8(__m256 data) : reg(data) {} + + explicit FP32Vec8(const FP32Vec8 &data) : reg(data.reg) {} + +#ifdef __AVX512FP16__ + explicit FP32Vec8(__m128h v) : reg(_mm256_cvtph_ps(_mm_castph_si128(v))) {} +#endif + + explicit FP32Vec8(const BF16Vec8 &v) + : reg(_mm256_castsi256_ps( + _mm256_bslli_epi128(_mm256_cvtepu16_epi32(v.reg), 2))) {} + + float reduce_sum() const { + AliasReg ar; + ar.reg = reg; + float result = 0; + unroll_loop([&result, &ar](int i) { result += ar.values[i]; }); + + return result; + } + + FP32Vec8 exp() const { + AliasReg ar; + ar.reg = reg; + return FP32Vec8(_mm256_set_ps(expf(ar.values[7]), expf(ar.values[6]), + expf(ar.values[5]), expf(ar.values[4]), + expf(ar.values[3]), expf(ar.values[2]), + expf(ar.values[1]), expf(ar.values[0]))); + } + + FP32Vec8 tanh() const { + AliasReg ar; + ar.reg = reg; + return FP32Vec8(_mm256_set_ps(tanhf(ar.values[7]), tanhf(ar.values[6]), + tanhf(ar.values[5]), tanhf(ar.values[4]), + tanhf(ar.values[3]), tanhf(ar.values[2]), + tanhf(ar.values[1]), tanhf(ar.values[0]))); + } + + FP32Vec8 er() const { + AliasReg ar; + ar.reg = reg; + return FP32Vec8(_mm256_set_ps(erf(ar.values[7]), erf(ar.values[6]), + erf(ar.values[5]), erf(ar.values[4]), + erf(ar.values[3]), erf(ar.values[2]), + erf(ar.values[1]), erf(ar.values[0]))); + } + + FP32Vec8 operator*(const FP32Vec8 &b) const { + return FP32Vec8(_mm256_mul_ps(reg, b.reg)); + } + + FP32Vec8 operator+(const FP32Vec8 &b) const { + return FP32Vec8(_mm256_add_ps(reg, b.reg)); + } + + FP32Vec8 operator-(const FP32Vec8 &b) const { + return FP32Vec8(_mm256_sub_ps(reg, b.reg)); + } + + FP32Vec8 operator/(const FP32Vec8 &b) const { + return FP32Vec8(_mm256_div_ps(reg, b.reg)); + } + + void save(float *ptr) const { _mm256_storeu_ps(ptr, reg); } +}; + +#ifdef __AVX512F__ +struct FP32Vec16 : public Vec { + constexpr static int VEC_ELEM_NUM = 16; + union AliasReg { + __m512 reg; + float values[VEC_ELEM_NUM]; + }; + + __m512 reg; + + explicit FP32Vec16(float v) : reg(_mm512_set1_ps(v)) {} + + explicit FP32Vec16() : reg(_mm512_set1_ps(0.0)) {} + + explicit FP32Vec16(const float *ptr) : reg(_mm512_loadu_ps(ptr)) {} + + explicit FP32Vec16(__m512 data) : reg(data) {} + + explicit FP32Vec16(const FP32Vec16 &data) : reg(data.reg) {} + + explicit FP32Vec16(const FP32Vec4 &data) + : reg((__m512)_mm512_inserti32x4( + _mm512_inserti32x4( + _mm512_inserti32x4(_mm512_castsi128_si512((__m128i)data.reg), + (__m128i)data.reg, 1), + (__m128i)data.reg, 2), + (__m128i)data.reg, 3)) {} + + explicit 
FP32Vec16(const FP32Vec8 &data) + : reg((__m512)_mm512_inserti32x8( + _mm512_castsi256_si512((__m256i)data.reg), (__m256i)data.reg, 1)) {} + + explicit FP32Vec16(const BF16Vec16 &v) + : reg(_mm512_castsi512_ps( + _mm512_bslli_epi128(_mm512_cvtepu16_epi32(v.reg), 2))) {} + + explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {} + + FP32Vec16 operator*(const FP32Vec16 &b) const { + return FP32Vec16(_mm512_mul_ps(reg, b.reg)); + } + + FP32Vec16 operator+(const FP32Vec16 &b) const { + return FP32Vec16(_mm512_add_ps(reg, b.reg)); + } + + FP32Vec16 operator-(const FP32Vec16 &b) const { + return FP32Vec16(_mm512_sub_ps(reg, b.reg)); + } + + FP32Vec16 operator/(const FP32Vec16 &b) const { + return FP32Vec16(_mm512_div_ps(reg, b.reg)); + } + + float reduce_sum() const { return _mm512_reduce_add_ps(reg); } + + template float reduce_sub_sum(int idx) { + static_assert(VEC_ELEM_NUM % group_size == 0); + constexpr uint32_t base_mask = (0xFFFF >> (16 - group_size)); + __mmask16 mask = _cvtu32_mask16(base_mask << (idx * group_size)); + return _mm512_mask_reduce_add_ps(mask, reg); + } + + void save(float *ptr) const { _mm512_storeu_ps(ptr, reg); } +}; +#else +struct FP32Vec16 : public Vec { + constexpr static int VEC_ELEM_NUM = 16; + + union AliasReg { + __m256 reg; + float values[8]; + }; + + __m256 reg_low; + __m256 reg_high; + + explicit FP32Vec16(float v) : reg_low(_mm256_set1_ps(v)), + reg_high(_mm256_set1_ps(v)) {} + + explicit FP32Vec16() : reg_low(_mm256_set1_ps(0.0)), + reg_high(_mm256_set1_ps(0.0)) {} + + explicit FP32Vec16(const float *ptr) : reg_low(_mm256_loadu_ps(ptr)), + reg_high(_mm256_loadu_ps(ptr + 8)) {} + + explicit FP32Vec16(__m256 low, __m256 high) : reg_low(low), reg_high(high) {} + + explicit FP32Vec16(const FP32Vec16 &data) : reg_low(data.reg_low), + reg_high(data.reg_high) {} + + explicit FP32Vec16(const FP32Vec4 &data) + : reg_low((__m256)_mm256_inserti128_si256( + _mm256_castsi128_si256((__m128i)data.reg), + (__m128i)data.reg, 1)), + reg_high((__m256)_mm256_inserti128_si256( + _mm256_castsi128_si256((__m128i)data.reg), + (__m128i)data.reg, 1)) {} + + explicit FP32Vec16(const FP32Vec8 &data) + : reg_low(data.reg), reg_high(data.reg) {} + + explicit FP32Vec16(const BF16Vec16 &v) { + __m128i low = _mm256_extractf128_si256(v.reg, 0); + __m128i high = _mm256_extractf128_si256(v.reg, 1); + + __m256i v_low_epi32 = _mm256_cvtepu16_epi32(low); + __m256i v_high_epi32 = _mm256_cvtepu16_epi32(high); + + __m256i v_low_shifted = _mm256_bslli_epi128(v_low_epi32, 2); + __m256i v_high_shifted = _mm256_bslli_epi128(v_high_epi32, 2); + + reg_low = _mm256_castsi256_ps(v_low_shifted); + reg_high = _mm256_castsi256_ps(v_high_shifted); + } + + explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {} + + FP32Vec16 operator*(const FP32Vec16 &b) const { + return FP32Vec16(_mm256_mul_ps(reg_low, b.reg_low), + _mm256_mul_ps(reg_high, b.reg_high)); + } + + FP32Vec16 operator+(const FP32Vec16 &b) const { + return FP32Vec16(_mm256_add_ps(reg_low, b.reg_low), + _mm256_add_ps(reg_high, b.reg_high)); + } + + FP32Vec16 operator-(const FP32Vec16 &b) const { + return FP32Vec16(_mm256_sub_ps(reg_low, b.reg_low), + _mm256_sub_ps(reg_high, b.reg_high)); + } + + FP32Vec16 operator/(const FP32Vec16 &b) const { + return FP32Vec16(_mm256_div_ps(reg_low, b.reg_low), + _mm256_div_ps(reg_high, b.reg_high)); + } + + float reduce_sum() const { + FP32Vec8 low = FP32Vec8(reg_low); + FP32Vec8 high = FP32Vec8(reg_high); + return low.reduce_sum() + high.reduce_sum(); + } + + template float reduce_sub_sum(int 
idx) { + float sum = 0.0; + static_assert(VEC_ELEM_NUM % group_size == 0); + constexpr uint32_t base_mask = (0xFFFF >> (16 - group_size)); + uint32_t mask = base_mask << (idx * group_size); + + AliasReg ar; + + auto func = [&sum, &mask, &ar](int i) { + int flag = mask & 0x1; + mask = mask >> 1; + if (flag != 0) sum += ar.values[i]; + }; + + ar.reg = reg_low; + unroll_loop(func); + + ar.reg = reg_high; + unroll_loop(func); + + return sum; + } + + void save(float *ptr) const { + _mm256_storeu_ps(ptr, reg_low); + _mm256_storeu_ps(ptr + 8, reg_high); + } +}; +#endif + +template struct VecType { using vec_type = void; }; + +template using vec_t = typename VecType::vec_type; + +template <> struct VecType { using vec_type = FP32Vec8; }; + +#ifdef __AVX512FP16__ +template <> struct VecType { using vec_type = FP16Vec16; }; +#endif + +template <> struct VecType { using vec_type = BF16Vec8; }; + +template void storeFP32(float v, T *ptr) { *ptr = v; } + +#ifdef __AVX512FP16__ +template <> inline void storeFP32(float v, c10::Half *ptr) { + *reinterpret_cast<_Float16 *>(ptr) = v; +} +#endif + +inline void fma(FP32Vec16 &acc, FP32Vec16 &a, FP32Vec16 &b) { + acc = acc + a * b; +} + +#ifdef __AVX512BF16__ +template <> inline void storeFP32(float v, c10::BFloat16 *ptr) { + *reinterpret_cast<__bfloat16 *>(ptr) = _mm_cvtness_sbh(v); +} + +inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) + : reg((__m128i)_mm256_cvtneps_pbh(v.reg)) {} + +inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) + : reg((__m256i)_mm512_cvtneps_pbh(v.reg)) {} + +inline void fma(FP32Vec16 &acc, BF16Vec32 &a, BF16Vec32 &b) { + acc.reg = _mm512_dpbf16_ps(acc.reg, (__m512bh)a.reg, (__m512bh)b.reg); +} +#else +template <> inline void storeFP32(float v, c10::BFloat16 *ptr) { + c10::BFloat16 __attribute__((__may_alias__)) *v_ptr = + reinterpret_cast(&v); + *ptr = *(v_ptr + 1); +} + +#ifdef __AVX512F__ +inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) + : reg(_mm256_cvtepi32_epi16( + _mm256_bsrli_epi128(_mm256_castps_si256(v.reg), 2))) {} + +inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) + : reg(_mm512_cvtepi32_epi16( + _mm512_bsrli_epi128(_mm512_castps_si512(v.reg), 2))) {} +#else +namespace{ +__m128i FP32Vec8_to_BF16Vec8_avx2(__m256 a) { + __m256i ai = _mm256_castps_si256(a); + ai = _mm256_srli_epi32(ai, 16); + ai = _mm256_packus_epi32(ai, ai); + ai = _mm256_permute4x64_epi64(ai, 0b00111001); + return _mm256_extracti128_si256(ai, 0); +} +} + +inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) + : reg(FP32Vec8_to_BF16Vec8_avx2(v.reg)) {} + +inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) { + BF16Vec8 low = BF16Vec8(FP32Vec8(v.reg_low)); + BF16Vec8 high = BF16Vec8(FP32Vec8(v.reg_high)); + reg = _mm256_insertf128_si256(_mm256_castsi128_si256(low.reg), high.reg, 1); +} +#endif // __AVX512F__ +#endif // __AVX512BF16__ + +inline void prefetch(const void *addr) { _mm_prefetch(addr, _MM_HINT_T1); } + +}; // namespace vec_op + +#endif diff --git a/csrc/ops.h b/csrc/ops.h index 6f0a7143c9169..8a92afdc81a9b 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -1,5 +1,6 @@ #pragma once +#include #include void paged_attention_v1( @@ -96,7 +97,8 @@ bool cutlass_scaled_mm_supports_fp8(int64_t cuda_device_capability); void cutlass_scaled_mm(torch::Tensor& out, torch::Tensor const& a, torch::Tensor const& b, torch::Tensor const& a_scales, - torch::Tensor const& b_scales); + torch::Tensor const& b_scales, + c10::optional const& bias); #endif diff --git a/csrc/punica/bgmv/bgmv_config.h b/csrc/punica/bgmv/bgmv_config.h old mode 100755 new mode 100644 index cb6694b3036e9..2c8d007d8719f 
--- a/csrc/punica/bgmv/bgmv_config.h +++ b/csrc/punica/bgmv/bgmv_config.h @@ -86,6 +86,7 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, f(in_T, out_T, W_T, narrow, 36864) \ f(in_T, out_T, W_T, narrow, 43264) \ f(in_T, out_T, W_T, narrow, 49152) \ + f(in_T, out_T, W_T, narrow, 49408) \ f(in_T, out_T, W_T, narrow, 60544) \ f(in_T, out_T, W_T, narrow, 60672) \ f(in_T, out_T, W_T, narrow, 64000) \ @@ -182,6 +183,7 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, f(in_T, out_T, W_T, 36864, narrow) \ f(in_T, out_T, W_T, 43264, narrow) \ f(in_T, out_T, W_T, 49152, narrow) \ + f(in_T, out_T, W_T, 49408, narrow) \ f(in_T, out_T, W_T, 60544, narrow) \ f(in_T, out_T, W_T, 60672, narrow) \ f(in_T, out_T, W_T, 64000, narrow) \ diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu index 38a20a1727d18..6ce25c5ac897b 100644 --- a/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu +++ b/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu @@ -77,24 +77,12 @@ struct enable_sm89_to_sm90 : Kernel { }; /* - This epilogue function defines a quantized GEMM operation similar to - torch._scaled_mm. - - A and B may be both either int8 or fp8_e4m3. A can be quantized per-tensor or - per-row. B can be quantized per-tensor or per-column. - Any combination of per-tensor and per-row or column is supported. - A and B must have symmetric quantization (zero point == 0). - - So the GEMM operation is D = (a_scales * A) (b_scales * B), where the - scales are applied elementwise with numpy-style broadcasting. - - ScaleA and ScaleB define the epilogue functions that apply the scales for - the A and B operands respectively. These scales may be either per-tensor or - per row or column. -*/ + * This class provides the common ScaleA and ScaleB descriptors for the + * ScaledEpilogue and ScaledEpilogueBias classes. + */ template -struct ScaledEpilogue { - private: +struct ScaledEpilogueBase { + protected: using Accum = cutlass::epilogue::threadblock::VisitorAccFetch; using ScaleA = cutlass::epilogue::threadblock::VisitorColOrScalarBroadcast< @@ -102,6 +90,32 @@ struct ScaledEpilogue { using ScaleB = cutlass::epilogue::threadblock::VisitorRowOrScalarBroadcast< OutputTileThreadMap, float, Stride, Int<1>, Int<0>>>; +}; + +/* + This epilogue function defines a quantized GEMM operation similar to + torch._scaled_mm. + + A and B may be both either int8 or fp8_e4m3. A can be quantized per-tensor or + per-row. B can be quantized per-tensor or per-column. + Any combination of per-tensor and per-row or column is supported. + A and B must have symmetric quantization (zero point == 0). + + So the GEMM operation is D = (a_scales * A) (b_scales * B), where the + scales are applied elementwise with numpy-style broadcasting. + + ScaleA and ScaleB define the epilogue functions that apply the scales for + the A and B operands respectively. These scales may be either per-tensor or + per row or column. 
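+
+  As an illustrative NumPy-style sketch (the shapes here are just one valid
+  choice, nothing this file prescribes): with A of shape (M, K), B of shape
+  (K, N), a_scales of shape (M, 1) or (1, 1), and b_scales of shape (1, N)
+  or (1, 1),
+
+    D = (a_scales * A.astype(np.float32)) @ (b_scales * B.astype(np.float32))
+
+  i.e. each scale broadcasts over the rows of A / the columns of B before
+  the matrix multiply.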
+*/ +template +struct ScaledEpilogue + : private ScaledEpilogueBase { + private: + using SUPER = ScaledEpilogueBase; + using Accum = typename SUPER::Accum; + using ScaleA = typename SUPER::ScaleA; + using ScaleB = typename SUPER::ScaleB; using Compute0 = cutlass::epilogue::threadblock::VisitorCompute< cutlass::multiplies, float, float, @@ -134,6 +148,53 @@ struct ScaledEpilogue { } }; +template +struct ScaledEpilogueBias + : private ScaledEpilogueBase { + private: + using SUPER = ScaledEpilogueBase; + using Accum = typename SUPER::Accum; + using ScaleA = typename SUPER::ScaleA; + using ScaleB = typename SUPER::ScaleB; + + using Compute0 = cutlass::epilogue::threadblock::VisitorCompute< + cutlass::multiplies, float, float, + cutlass::FloatRoundStyle::round_to_nearest>; + + using EVTCompute0 = + cutlass::epilogue::threadblock::Sm80EVT; + + using Compute1 = cutlass::epilogue::threadblock::VisitorCompute< + cutlass::multiply_add, ElementD, float, + cutlass::FloatRoundStyle::round_to_nearest>; + + using Bias = cutlass::epilogue::threadblock::VisitorRowBroadcast< + OutputTileThreadMap, ElementD, Stride, Int<1>, Int<0>>>; + + public: + using EVTCompute = cutlass::epilogue::threadblock::Sm80EVT; + using ArgumentType = typename EVTCompute::Arguments; + + static ArgumentType prepare_args(torch::Tensor const& a_scales, + torch::Tensor const& b_scales, + torch::Tensor const& bias) { + using ScaleAArgs = typename ScaleA::Arguments; + using ScaleBArgs = typename ScaleB::Arguments; + using BiasArgs = typename Bias::Arguments; + + ScaleBArgs b_args{b_scales.data_ptr(), b_scales.numel() != 1, {}}; + ScaleAArgs a_args{a_scales.data_ptr(), a_scales.numel() != 1, {}}; + BiasArgs bias_args{static_cast(bias.data_ptr()), {}}; + + typename EVTCompute0::Arguments evt0_compute_args{b_args}; + + typename EVTCompute::Arguments evt_compute_args{a_args, evt0_compute_args, + bias_args}; + return evt_compute_args; + } +}; + template typename ArchGuard, typename ElementAB_, typename ElementD_, template typename Epilogue_, typename TileShape, @@ -168,13 +229,13 @@ struct cutlass_2x_gemm { // clang-format off using RowMajor = typename cutlass::layout::RowMajor; using ColumnMajor = typename cutlass::layout::ColumnMajor; - using KernelType = + using KernelType = ArchGuard typename Epilogue, + typename... EpilogueArgs> +void cutlass_scaled_mm_sm75_epilogue(torch::Tensor& out, torch::Tensor const& a, + torch::Tensor const& b, + EpilogueArgs&&... 
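+ // (The epilogue arguments, i.e. the scale tensors plus the bias tensor in
+ //  the bias variant, travel as a parameter pack, so this one helper can
+ //  instantiate kernels for both ScaledEpilogue and ScaledEpilogueBias.)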
epilogue_args) { TORCH_CHECK(a.dtype() == torch::kInt8); TORCH_CHECK(b.dtype() == torch::kInt8); - TORCH_CHECK(a_scales.dtype() == torch::kFloat32); - TORCH_CHECK(b_scales.dtype() == torch::kFloat32); using TileShape = typename cutlass::gemm::GemmShape<128, 128, 64>; using WarpShape = typename cutlass::gemm::GemmShape<64, 64, 64>; @@ -420,78 +480,130 @@ void cutlass_scaled_mm_sm75(torch::Tensor& out, torch::Tensor const& a, if (out.dtype() == torch::kBFloat16) { return cutlass_gemm_caller>( - out, a, b, a_scales, b_scales); + Epilogue, TileShape, WarpShape, InstructionShape, 2>>( + out, a, b, std::forward(epilogue_args)...); } else { TORCH_CHECK(out.dtype() == torch::kFloat16); return cutlass_gemm_caller>( - out, a, b, a_scales, b_scales); + Epilogue, TileShape, WarpShape, InstructionShape, 2>>( + out, a, b, std::forward(epilogue_args)...); } } -void cutlass_scaled_mm_sm80(torch::Tensor& out, torch::Tensor const& a, +void cutlass_scaled_mm_sm75(torch::Tensor& out, torch::Tensor const& a, torch::Tensor const& b, torch::Tensor const& a_scales, - torch::Tensor const& b_scales) { - TORCH_CHECK(a.dtype() == torch::kInt8); - TORCH_CHECK(b.dtype() == torch::kInt8); + torch::Tensor const& b_scales, + c10::optional const& bias) { TORCH_CHECK(a_scales.dtype() == torch::kFloat32); TORCH_CHECK(b_scales.dtype() == torch::kFloat32); + if (bias) { + TORCH_CHECK(bias->dtype() == out.dtype(), + "currently bias dtype must match output dtype ", out.dtype()); + return cutlass_scaled_mm_sm75_epilogue( + out, a, b, a_scales, b_scales, *bias); + } else { + return cutlass_scaled_mm_sm75_epilogue(out, a, b, a_scales, + b_scales); + } +} + +template