diff --git a/.buildkite/download-images.sh b/.buildkite/download-images.sh deleted file mode 100644 index 389a12956c3c3..0000000000000 --- a/.buildkite/download-images.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/bash - -set -ex -set -o pipefail - -(which wget && which curl) || (apt-get update && apt-get install -y wget curl) - -# aws s3 sync s3://air-example-data-2/vllm_opensource_llava/ images/ -mkdir -p images -cd images -wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign_pixel_values.pt -wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign_image_features.pt -wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom_pixel_values.pt -wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom_image_features.pt -wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign.jpg -wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom.jpg - -cd - diff --git a/.buildkite/lm-eval-harness/configs/DeepSeek-V2-Lite-Chat.yaml b/.buildkite/lm-eval-harness/configs/DeepSeek-V2-Lite-Chat.yaml new file mode 100644 index 0000000000000..15268395ec68b --- /dev/null +++ b/.buildkite/lm-eval-harness/configs/DeepSeek-V2-Lite-Chat.yaml @@ -0,0 +1,11 @@ +# bash ./run-lm-eval-gsm-vllm-baseline.sh -m deepseek-ai/DeepSeek-V2-Lite-Chat -b "auto" -l 1000 -f 5 -t 2 +model_name: "deepseek-ai/DeepSeek-V2-Lite-Chat" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.671 + - name: "exact_match,flexible-extract" + value: 0.664 +limit: 1000 +num_fewshot: 5 diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform.yaml new file mode 100644 index 0000000000000..4397effa82cc8 --- /dev/null +++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform.yaml @@ -0,0 +1,11 @@ +# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5 +model_name: "nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.905 + - name: "exact_match,flexible-extract" + value: 0.905 +limit: 1000 +num_fewshot: 5 diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct.yaml new file mode 100644 index 0000000000000..fa6ea236ef04f --- /dev/null +++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct.yaml @@ -0,0 +1,11 @@ +# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-70B-Instruct -b 32 -l 250 -f 5 +model_name: "meta-llama/Meta-Llama-3-70B-Instruct" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.892 + - name: "exact_match,flexible-extract" + value: 0.892 +limit: 250 +num_fewshot: 5 diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml new file mode 100644 index 0000000000000..c513159c6fa0d --- /dev/null +++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml @@ -0,0 +1,11 @@ +# bash 
.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors -b auto -l 1000 -f 5 -t 1 +model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.752 + - name: "exact_match,flexible-extract" + value: 0.754 +limit: 1000 +num_fewshot: 5 diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform.yaml new file mode 100644 index 0000000000000..5e57fcbcf7d9b --- /dev/null +++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform.yaml @@ -0,0 +1,11 @@ +# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5 -t 1 +model_name: "nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.753 + - name: "exact_match,flexible-extract" + value: 0.753 +limit: 1000 +num_fewshot: 5 diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml new file mode 100644 index 0000000000000..374171f1f915b --- /dev/null +++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml @@ -0,0 +1,11 @@ +# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test -b 32 -l 1000 -f 5 -t 1 +model_name: "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.755 + - name: "exact_match,flexible-extract" + value: 0.755 +limit: 1000 +num_fewshot: 5 diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8.yaml new file mode 100644 index 0000000000000..dc36b705634f9 --- /dev/null +++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8.yaml @@ -0,0 +1,11 @@ +# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Meta-Llama-3-8B-Instruct-FP8 -b 32 -l 250 -f 5 -t 1 +model_name: "neuralmagic/Meta-Llama-3-8B-Instruct-FP8" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.753 + - name: "exact_match,flexible-extract" + value: 0.753 +limit: 1000 +num_fewshot: 5 diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml new file mode 100644 index 0000000000000..bc29002985969 --- /dev/null +++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml @@ -0,0 +1,11 @@ +# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test -b "auto" -l 250 -f 5 -t 1 +model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.728 + - name: "exact_match,flexible-extract" + value: 0.728 +limit: 250 +num_fewshot: 5 diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml 
b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml new file mode 100644 index 0000000000000..3964f3be5e874 --- /dev/null +++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml @@ -0,0 +1,11 @@ +# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test -b auto -l 1000 -f 5 -t 1 +model_name: "nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.758 + - name: "exact_match,flexible-extract" + value: 0.759 +limit: 1000 +num_fewshot: 5 diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml new file mode 100644 index 0000000000000..fb4b4915ab955 --- /dev/null +++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml @@ -0,0 +1,11 @@ +# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5 -t 1 +model_name: "meta-llama/Meta-Llama-3-8B-Instruct" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.756 + - name: "exact_match,flexible-extract" + value: 0.752 +limit: 250 +num_fewshot: 5 diff --git a/.buildkite/lm-eval-harness/configs/Mixtral-8x22B-Instruct-v0.1-FP8-Dynamic.yaml b/.buildkite/lm-eval-harness/configs/Mixtral-8x22B-Instruct-v0.1-FP8-Dynamic.yaml new file mode 100644 index 0000000000000..75a24e408e7ad --- /dev/null +++ b/.buildkite/lm-eval-harness/configs/Mixtral-8x22B-Instruct-v0.1-FP8-Dynamic.yaml @@ -0,0 +1,11 @@ +# bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic -b "auto" -l 250 -f 5 -t 8 +model_name: "neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.86 + - name: "exact_match,flexible-extract" + value: 0.86 +limit: 250 +num_fewshot: 5 diff --git a/.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1-FP8.yaml b/.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1-FP8.yaml new file mode 100644 index 0000000000000..436ec21924ca1 --- /dev/null +++ b/.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1-FP8.yaml @@ -0,0 +1,11 @@ +# bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8 -b "auto" -l 250 -f 5 -t 4 +model_name: "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.624 + - name: "exact_match,flexible-extract" + value: 0.624 +limit: 250 +num_fewshot: 5 diff --git a/.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1.yaml b/.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1.yaml new file mode 100644 index 0000000000000..dec9164d1b84e --- /dev/null +++ b/.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1.yaml @@ -0,0 +1,11 @@ +# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1 -b 32 -l 250 -f 5 -t 4 +model_name: "mistralai/Mixtral-8x7B-Instruct-v0.1" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.616 + - name: "exact_match,flexible-extract" + value: 0.632 +limit: 250 +num_fewshot: 5 diff --git a/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml b/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml 
new file mode 100644 index 0000000000000..43ff2bc5ce35e --- /dev/null +++ b/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml @@ -0,0 +1,11 @@ +# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1 +model_name: "neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.593 + - name: "exact_match,flexible-extract" + value: 0.588 +limit: 1000 +num_fewshot: 5 diff --git a/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-W8A16-compressed-tensors.yaml b/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-W8A16-compressed-tensors.yaml new file mode 100644 index 0000000000000..259799ba8bfa9 --- /dev/null +++ b/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-W8A16-compressed-tensors.yaml @@ -0,0 +1,11 @@ +# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise -b "auto" -l 1000 -f 5 -t 1 +model_name: "nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.595 + - name: "exact_match,flexible-extract" + value: 0.582 +limit: 1000 +num_fewshot: 5 diff --git a/.buildkite/lm-eval-harness/configs/Qwen2-57B-A14-Instruct.yaml b/.buildkite/lm-eval-harness/configs/Qwen2-57B-A14-Instruct.yaml new file mode 100644 index 0000000000000..45d5efc8860f5 --- /dev/null +++ b/.buildkite/lm-eval-harness/configs/Qwen2-57B-A14-Instruct.yaml @@ -0,0 +1,11 @@ +# bash ./run-lm-eval-gsm-vllm-baseline.sh -m Qwen/Qwen2-57B-A14B-Instruct -b "auto" -l 250 -f 5 -t 4 +model_name: "Qwen/Qwen2-57B-A14B-Instruct" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.792 + - name: "exact_match,flexible-extract" + value: 0.824 +limit: 250 +num_fewshot: 5 diff --git a/.buildkite/lm-eval-harness/configs/models-large.txt b/.buildkite/lm-eval-harness/configs/models-large.txt new file mode 100644 index 0000000000000..37eeac85c933b --- /dev/null +++ b/.buildkite/lm-eval-harness/configs/models-large.txt @@ -0,0 +1,5 @@ +Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform.yaml +Meta-Llama-3-70B-Instruct.yaml +Mixtral-8x7B-Instruct-v0.1.yaml +Qwen2-57B-A14-Instruct.yaml +DeepSeek-V2-Lite-Chat.yaml diff --git a/.buildkite/lm-eval-harness/configs/models-small.txt b/.buildkite/lm-eval-harness/configs/models-small.txt new file mode 100644 index 0000000000000..1d1b0ed38671d --- /dev/null +++ b/.buildkite/lm-eval-harness/configs/models-small.txt @@ -0,0 +1,7 @@ +Meta-Llama-3-8B-Instruct.yaml +Meta-Llama-3-8B-Instruct-FP8.yaml +Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml +Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml +Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml +Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml +Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml diff --git a/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh new file mode 100644 index 0000000000000..fdb8ec5393b36 --- /dev/null +++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh @@ -0,0 +1,46 @@ +#!/bin/bash +# We can use this script to compute baseline accuracy on GSM for transformers. 
+# +# Make sure you have lm-eval-harness installed: +# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@9516087b81a61d0e220b22cc1b75be76de23bc10 + +usage() { + echo`` + echo "Runs lm eval harness on GSM8k using huggingface transformers." + echo "This pathway is intended to be used to create baselines for " + echo "our automated nm-test-accuracy workflow" + echo + echo "usage: ${0} " + echo + echo " -m - huggingface stub or local directory of the model" + echo " -b - batch size to run the evaluation at" + echo " -l - limit number of samples to run" + echo " -f - number of fewshot samples to use" + echo +} + +while getopts "m:b:l:f:" OPT; do + case ${OPT} in + m ) + MODEL="$OPTARG" + ;; + b ) + BATCH_SIZE="$OPTARG" + ;; + l ) + LIMIT="$OPTARG" + ;; + f ) + FEWSHOT="$OPTARG" + ;; + \? ) + usage + exit 1 + ;; + esac +done + +lm_eval --model hf \ + --model_args pretrained=$MODEL,parallelize=True \ + --tasks gsm8k --num_fewshot $FEWSHOT --limit $LIMIT \ + --batch_size $BATCH_SIZE diff --git a/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh new file mode 100644 index 0000000000000..de841d959a4e4 --- /dev/null +++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh @@ -0,0 +1,51 @@ +#!/bin/bash +# We can use this script to compute baseline accuracy on GSM for vllm. +# We use this for fp8, which HF does not support. +# +# Make sure you have lm-eval-harness installed: +# pip install lm-eval==0.4.3 + +usage() { + echo`` + echo "Runs lm eval harness on GSM8k using huggingface transformers." + echo "This pathway is intended to be used to create baselines for " + echo "our automated nm-test-accuracy workflow" + echo + echo "usage: ${0} " + echo + echo " -m - huggingface stub or local directory of the model" + echo " -b - batch size to run the evaluation at" + echo " -l - limit number of samples to run" + echo " -f - number of fewshot samples to use" + echo " -t - tensor parallel size to run at" + echo +} + +while getopts "m:b:l:f:t:" OPT; do + case ${OPT} in + m ) + MODEL="$OPTARG" + ;; + b ) + BATCH_SIZE="$OPTARG" + ;; + l ) + LIMIT="$OPTARG" + ;; + f ) + FEWSHOT="$OPTARG" + ;; + t ) + TP_SIZE="$OPTARG" + ;; + \? ) + usage + exit 1 + ;; + esac +done + +lm_eval --model vllm \ + --model_args pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,distributed_executor_backend="ray",trust_remote_code=true,max_model_len=4096 \ + --tasks gsm8k --num_fewshot $FEWSHOT --limit $LIMIT \ + --batch_size $BATCH_SIZE diff --git a/.buildkite/lm-eval-harness/run-tests.sh b/.buildkite/lm-eval-harness/run-tests.sh new file mode 100644 index 0000000000000..b4fdde6dab425 --- /dev/null +++ b/.buildkite/lm-eval-harness/run-tests.sh @@ -0,0 +1,59 @@ +#!/bin/bash + +usage() { + echo`` + echo "Runs lm eval harness on GSM8k using vllm and compares to " + echo "precomputed baseline (measured by HF transformers.)" + echo + echo "usage: ${0} " + echo + echo " -c - path to the test data config (e.g. configs/small-models.txt)" + echo " -t - tensor parallel size" + echo +} + +SUCCESS=0 + +while getopts "c:t:" OPT; do + case ${OPT} in + c ) + CONFIG="$OPTARG" + ;; + t ) + TP_SIZE="$OPTARG" + ;; + \? ) + usage + exit 1 + ;; + esac +done + +# Parse list of configs. 
+IFS=$'\n' read -d '' -r -a MODEL_CONFIGS < $CONFIG + +for MODEL_CONFIG in "${MODEL_CONFIGS[@]}" +do + LOCAL_SUCCESS=0 + + echo "=== RUNNING MODEL: $MODEL_CONFIG WITH TP SIZE: $TP_SIZE===" + + export LM_EVAL_TEST_DATA_FILE=$PWD/configs/${MODEL_CONFIG} + export LM_EVAL_TP_SIZE=$TP_SIZE + pytest -s test_lm_eval_correctness.py || LOCAL_SUCCESS=$? + + if [[ $LOCAL_SUCCESS == 0 ]]; then + echo "=== PASSED MODEL: ${MODEL_CONFIG} ===" + else + echo "=== FAILED MODEL: ${MODEL_CONFIG} ===" + fi + + SUCCESS=$((SUCCESS + LOCAL_SUCCESS)) + +done + +if [ "${SUCCESS}" -eq "0" ]; then + exit 0 +else + exit 1 +fi diff --git a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py new file mode 100644 index 0000000000000..7fdce7b53bd7f --- /dev/null +++ b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py @@ -0,0 +1,55 @@ +""" +LM eval harness on model to compare vs HF baseline computed offline. +Configs are found in configs/$MODEL.yaml + +* export LM_EVAL_TEST_DATA_FILE=configs/Meta-Llama-3-70B-Instruct.yaml +* export LM_EVAL_TP_SIZE=4 +* pytest -s test_lm_eval_correctness.py +""" + +import os +from pathlib import Path + +import lm_eval +import numpy +import yaml + +RTOL = 0.02 +TEST_DATA_FILE = os.environ.get( + "LM_EVAL_TEST_DATA_FILE", + ".buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml") + +TP_SIZE = os.environ.get("LM_EVAL_TP_SIZE", 1) + + +def launch_lm_eval(eval_config): + model_args = f"pretrained={eval_config['model_name']}," \ + f"tensor_parallel_size={TP_SIZE}," \ + f"add_bos_token=true" + + results = lm_eval.simple_evaluate( + model="vllm", + model_args=model_args, + tasks=[task["name"] for task in eval_config["tasks"]], + num_fewshot=eval_config["num_fewshot"], + limit=eval_config["limit"], + batch_size="auto") + + return results + + +def test_lm_eval_correctness(): + eval_config = yaml.safe_load( + Path(TEST_DATA_FILE).read_text(encoding="utf-8")) + + # Launch eval requests. + results = launch_lm_eval(eval_config) + + # Confirm scores match ground truth. + for task in eval_config["tasks"]: + for metric in task["metrics"]: + ground_truth = metric["value"] + measured_value = results["results"][task["name"]][metric["name"]] + print(f'{task["name"]} | {metric["name"]}: ' + f'ground_truth={ground_truth} | measured={measured_value}') + assert numpy.isclose(ground_truth, measured_value, rtol=RTOL) diff --git a/.buildkite/nightly-benchmarks/README.md b/.buildkite/nightly-benchmarks/README.md index 4036b32a46bf7..c84e150934306 100644 --- a/.buildkite/nightly-benchmarks/README.md +++ b/.buildkite/nightly-benchmarks/README.md @@ -1,5 +1,6 @@ # vLLM benchmark suite + ## Introduction This directory contains the performance benchmarking CI for vllm. 
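For orientation, the lm-eval-harness pieces added above fit together as follows: the `run-lm-eval-gsm-*-baseline.sh` scripts produce the ground-truth GSM8K scores that get recorded in the `configs/*.yaml` files, and `test_lm_eval_correctness.py` re-measures each task with vLLM and asserts the result is within `RTOL = 0.02` of the recorded value. A hypothetical local flow, with the model name and config filename as placeholders and paths relative to `.buildkite/lm-eval-harness/`:

```bash
# 1. Measure a baseline (vLLM path shown; use run-lm-eval-gsm-hf-baseline.sh for the HF path)
#    and copy the reported exact_match scores into configs/<Model>.yaml.
bash run-lm-eval-gsm-vllm-baseline.sh -m <org>/<model> -b auto -l 1000 -f 5 -t 1

# 2. Re-check the recorded values the same way CI does.
export LM_EVAL_TEST_DATA_FILE=$PWD/configs/<Model>.yaml
export LM_EVAL_TP_SIZE=1
pytest -s test_lm_eval_correctness.py

# 3. Or run every config in the small-model list at once.
bash run-tests.sh -c configs/models-small.txt -t 1
```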
diff --git a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml index 2b25c954b5c5c..02c0ee534d72c 100644 --- a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml +++ b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml @@ -11,7 +11,7 @@ steps: - sh - .buildkite/nightly-benchmarks/scripts/wait-for-image.sh - wait - - label: "A100 Benchmark" + - label: "A100" agents: queue: A100 plugins: @@ -42,21 +42,20 @@ steps: - name: devshm emptyDir: medium: Memory - # - label: "H100: NVIDIA SMI" - # agents: - # queue: H100 - # plugins: - # - docker#v5.11.0: - # image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT - # command: - # - bash - # - .buildkite/nightly-benchmarks/run-benchmarks-suite.sh - # mount-buildkite-agent: true - # propagate-environment: true - # propagate-uid-gid: false - # ipc: host - # gpus: all - # environment: - # - VLLM_USAGE_SOURCE - # - HF_TOKEN + - label: "H100" + agents: + queue: H100 + plugins: + - docker#v5.11.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT + command: + - bash + - .buildkite/nightly-benchmarks/run-benchmarks-suite.sh + mount-buildkite-agent: true + propagate-environment: true + ipc: host + gpus: all + environment: + - VLLM_USAGE_SOURCE + - HF_TOKEN diff --git a/.buildkite/nightly-benchmarks/kickoff-pipeline.sh b/.buildkite/nightly-benchmarks/kickoff-pipeline.sh deleted file mode 100755 index 15d411febcee1..0000000000000 --- a/.buildkite/nightly-benchmarks/kickoff-pipeline.sh +++ /dev/null @@ -1,27 +0,0 @@ -#!/usr/bin/env bash - -# NOTE(simon): this script runs inside a buildkite agent with CPU only access. -set -euo pipefail - -# Install system packages -apt update -apt install -y curl jq - -# Install minijinja for templating -curl -sSfL https://github.com/mitsuhiko/minijinja/releases/latest/download/minijinja-cli-installer.sh | sh -source $HOME/.cargo/env - -# If BUILDKITE_PULL_REQUEST != "false", then we check the PR labels using curl and jq -if [ "$BUILDKITE_PULL_REQUEST" != "false" ]; then - PR_LABELS=$(curl -s "https://api.github.com/repos/vllm-project/vllm/pulls/$BUILDKITE_PULL_REQUEST" | jq -r '.labels[].name') - - if [[ $PR_LABELS == *"perf-benchmarks"* ]]; then - echo "This PR has the 'perf-benchmarks' label. Proceeding with the nightly benchmarks." - else - echo "This PR does not have the 'perf-benchmarks' label. Skipping the nightly benchmarks." - exit 0 - fi -fi - -# Upload sample.yaml -buildkite-agent pipeline upload .buildkite/nightly-benchmarks/benchmark-pipeline.yaml diff --git a/.buildkite/nightly-benchmarks/nightly-descriptions.md b/.buildkite/nightly-benchmarks/nightly-descriptions.md new file mode 100644 index 0000000000000..c3d3cbf473968 --- /dev/null +++ b/.buildkite/nightly-benchmarks/nightly-descriptions.md @@ -0,0 +1,45 @@ + +# Nightly benchmark + +The main goal of this benchmarking is two-fold: +- Performance clarity: Provide clarity on which one (vllm, tensorrt-llm, lmdeploy and tgi) leads in performance in what workload. +- Reproducible: one can run the exact same set of benchmarking commands inside the exact same docker by following reproducing instructions in [reproduce.md](). + + +## Docker images + +We benchmark vllm, tensorrt-llm, lmdeploy and tgi using the following docker images: +- vllm/vllm-openai:v0.5.0.post1 +- nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3 +- openmmlab/lmdeploy:v0.5.0 +- ghcr.io/huggingface/text-generation-inference:2.1 + + + + +## Hardware + +One AWS node with 8x NVIDIA A100 GPUs. 
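The nightly comparison pins each serving engine to the image version listed under "Docker images" above. If you want to stage the same environment outside Buildkite, a minimal sketch (assuming Docker with NVIDIA GPU support is available) is simply to pre-pull those tags:

```bash
# Image tags as listed in nightly-descriptions.md above.
docker pull vllm/vllm-openai:v0.5.0.post1
docker pull nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3
docker pull openmmlab/lmdeploy:v0.5.0
docker pull ghcr.io/huggingface/text-generation-inference:2.1
```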
+ + +## Workload description + +We benchmark vllm, tensorrt-llm, lmdeploy and tgi using the following workload: + +- Input length: randomly sample 500 prompts from ShareGPT dataset (with fixed random seed). +- Output length: the corresponding output length of these 500 prompts. +- Models: llama-3 8B, llama-3 70B, mixtral 8x7B. +- Average QPS (query per second): 4 for the small model (llama-3 8B) and 2 for other two models. For each QPS, the arrival time of each query is determined using a random Poisson process (with fixed random seed). +- Evaluation metrics: Throughput (higher the better), TTFT (time to the first token, lower the better), ITL (inter-token latency, lower the better). + + + +## Plots + +In the following plots, the dot shows the mean and the error bar shows the standard error of the mean. Value 0 means that the corresponding benchmark crashed. + +Benchmarking results + +## Results + +{nightly_results_benchmarking_table} diff --git a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml new file mode 100644 index 0000000000000..6e399bb936fbc --- /dev/null +++ b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml @@ -0,0 +1,120 @@ +common_pod_spec: &common_pod_spec + priorityClassName: perf-benchmark + nodeSelector: + nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB + volumes: + - name: devshm + emptyDir: + medium: Memory + - name: hf-cache + hostPath: + path: /root/.cache/huggingface + type: Directory + +common_container_settings: &common_container_settings + command: + - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh + resources: + limits: + nvidia.com/gpu: 8 + volumeMounts: + - name: devshm + mountPath: /dev/shm + - name: hf-cache + mountPath: /root/.cache/huggingface + env: + - name: VLLM_USAGE_SOURCE + value: ci-test + - name: HF_HOME + value: /root/.cache/huggingface + - name: VLLM_SOURCE_CODE_LOC + value: /workspace/build/buildkite/vllm/performance-benchmark + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + +steps: + - block: ":rocket: Ready for comparing vllm against alternatives? This will take 4 hours." 
+ - label: "A100 trt benchmark" + priority: 100 + agents: + queue: A100 + plugins: + - kubernetes: + podSpec: + <<: *common_pod_spec + containers: + - image: nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3 + <<: *common_container_settings + + - label: "A100 lmdeploy benchmark" + priority: 100 + agents: + queue: A100 + plugins: + - kubernetes: + podSpec: + <<: *common_pod_spec + containers: + - image: openmmlab/lmdeploy:v0.5.0 + <<: *common_container_settings + + + - label: "A100 vllm benchmark" + priority: 100 + agents: + queue: A100 + plugins: + - kubernetes: + podSpec: + <<: *common_pod_spec + containers: + - image: vllm/vllm-openai:latest + <<: *common_container_settings + + - label: "A100 tgi benchmark" + priority: 100 + agents: + queue: A100 + plugins: + - kubernetes: + podSpec: + <<: *common_pod_spec + containers: + - image: ghcr.io/huggingface/text-generation-inference:2.1 + <<: *common_container_settings + + - wait + + - label: "Plot" + priority: 100 + agents: + queue: A100 + plugins: + - kubernetes: + podSpec: + <<: *common_pod_spec + containers: + - image: vllm/vllm-openai:v0.5.0.post1 + command: + - bash .buildkite/nightly-benchmarks/scripts/nightly-annotate.sh + resources: + limits: + nvidia.com/gpu: 8 + volumeMounts: + - name: devshm + mountPath: /dev/shm + env: + - name: VLLM_USAGE_SOURCE + value: ci-test + - name: VLLM_SOURCE_CODE_LOC + value: /workspace/build/buildkite/vllm/performance-benchmark + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + + - wait \ No newline at end of file diff --git a/.buildkite/nightly-benchmarks/run-benchmarks-suite.sh b/.buildkite/nightly-benchmarks/run-benchmarks-suite.sh index 021473f76d0e5..04b02adf3644c 100644 --- a/.buildkite/nightly-benchmarks/run-benchmarks-suite.sh +++ b/.buildkite/nightly-benchmarks/run-benchmarks-suite.sh @@ -54,7 +54,7 @@ wait_for_server() { # wait for vllm server to start # return 1 if vllm server crashes timeout 1200 bash -c ' - until curl localhost:8000/v1/completions; do + until curl -X POST localhost:8000/v1/completions; do sleep 1 done' && return 0 || return 1 } @@ -73,8 +73,17 @@ kill_gpu_processes() { echo "All GPU processes have been killed." fi + # Sometimes kill with pid doesn't work properly, we can also kill all process running python or python3 + # since we are in container anyway + pkill -9 -f python + pkill -9 -f python3 + # waiting for GPU processes to be fully killed - sleep 10 + # loop while nvidia-smi returns any processes + while [ -n "$(nvidia-smi --query-compute-apps=pid --format=csv,noheader)" ]; do + sleep 1 + echo "Waiting for GPU processes to be killed" + done # remove vllm config file rm -rf ~/.config/vllm @@ -90,12 +99,19 @@ upload_to_buildkite() { # upload the benchmarking results to buildkite # if the agent binary is not found, skip uploading the results, exit 0 - if [ ! -f /workspace/buildkite-agent ]; then + # Check if buildkite-agent is available in the PATH or at /workspace/buildkite-agent + if command -v buildkite-agent >/dev/null 2>&1; then + BUILDKITE_AGENT_COMMAND="buildkite-agent" + elif [ -f /workspace/buildkite-agent ]; then + BUILDKITE_AGENT_COMMAND="/workspace/buildkite-agent" + else echo "buildkite-agent binary not found. Skip uploading the results." 
return 0 fi - /workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < $RESULTS_FOLDER/benchmark_results.md - /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*" + + # Use the determined command to annotate and upload artifacts + $BUILDKITE_AGENT_COMMAND annotate --style "info" --context "$BUILDKITE_LABEL-benchmark-results" < $RESULTS_FOLDER/benchmark_results.md + $BUILDKITE_AGENT_COMMAND artifact upload "$RESULTS_FOLDER/*" } run_latency_tests() { @@ -269,6 +285,7 @@ run_serving_tests() { echo "Running test case $test_name" echo "Server command: $server_command" eval "$server_command" & + server_pid=$! # wait until the server is alive wait_for_server @@ -318,6 +335,7 @@ run_serving_tests() { done # clean up + kill -9 $server_pid kill_gpu_processes done } diff --git a/.buildkite/nightly-benchmarks/run-nightly-suite.sh b/.buildkite/nightly-benchmarks/run-nightly-suite.sh new file mode 100644 index 0000000000000..627a3e6971578 --- /dev/null +++ b/.buildkite/nightly-benchmarks/run-nightly-suite.sh @@ -0,0 +1,76 @@ +#!/bin/bash + +set -o pipefail +set -x + +check_gpus() { + # check the number of GPUs and GPU type. + declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l) + if [[ $gpu_count -gt 0 ]]; then + echo "GPU found." + else + echo "Need at least 1 GPU to run benchmarking." + exit 1 + fi + declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}') + echo "GPU type is $gpu_type" +} + +check_hf_token() { + # check if HF_TOKEN is available and valid + if [[ -z "$HF_TOKEN" ]]; then + echo "Error: HF_TOKEN is not set." + exit 1 + elif [[ ! "$HF_TOKEN" =~ ^hf_ ]]; then + echo "Error: HF_TOKEN does not start with 'hf_'." + exit 1 + else + echo "HF_TOKEN is set and valid." + fi +} + +main() { + + check_gpus + check_hf_token + + df -h + + (which wget && which curl) || (apt-get update && apt-get install -y wget curl) + (which jq) || (apt-get update && apt-get -y install jq) + + cd $VLLM_SOURCE_CODE_LOC/benchmarks + wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json + + + # run lmdeploy + if which lmdeploy >/dev/null; then + echo "lmdeploy is available, redirect to run-lmdeploy-nightly.sh" + bash ../.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh + exit 0 + fi + + # run tgi + if [ -e /tgi-entrypoint.sh ]; then + echo "tgi is available, redirect to run-tgi-nightly.sh" + bash ../.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh + exit 0 + fi + + # run trt + if which trtllm-build >/dev/null; then + echo "trtllm is available, redirect to run-trt-nightly.sh" + bash ../.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh + exit 0 + fi + + # run vllm + if [ -e /vllm-workspace ]; then + echo "vllm is available, redirect to run-vllm-nightly.sh" + bash ../.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh + exit 0 + fi + +} + +main "$@" \ No newline at end of file diff --git a/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py b/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py new file mode 100644 index 0000000000000..68ac5909e5951 --- /dev/null +++ b/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py @@ -0,0 +1,26 @@ +import argparse + +from transformers import AutoTokenizer + + +def main(model, cachedir): + # Load the tokenizer and save it to the specified directory + tokenizer = AutoTokenizer.from_pretrained(model) + tokenizer.save_pretrained(cachedir) + print(f"Tokenizer saved to 
{cachedir}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Download and save Hugging Face tokenizer") + parser.add_argument("--model", + type=str, + required=True, + help="Name of the model") + parser.add_argument("--cachedir", + type=str, + required=True, + help="Directory to save the tokenizer") + + args = parser.parse_args() + main(args.model, args.cachedir) diff --git a/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py b/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py new file mode 100644 index 0000000000000..18bcc3a8714c4 --- /dev/null +++ b/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py @@ -0,0 +1,6 @@ +from lmdeploy.serve.openai.api_client import APIClient + +api_client = APIClient("http://localhost:8000") +model_name = api_client.available_models[0] + +print(model_name) diff --git a/.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh b/.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh new file mode 100644 index 0000000000000..f8262653a6628 --- /dev/null +++ b/.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh @@ -0,0 +1,102 @@ +#!/bin/bash + + +server_params=$1 +common_params=$2 + + + +model_path=$(echo "$common_params" | jq -r '.model') +model_name="${model_path#*/}" +model_type=$(echo "$server_params" | jq -r '.model_type') +model_dtype=$(echo "$server_params" | jq -r '.model_dtype') +model_tp_size=$(echo "$common_params" | jq -r '.tp') +max_batch_size=$(echo "$server_params" | jq -r '.max_batch_size') +max_input_len=$(echo "$server_params" | jq -r '.max_input_len') +max_output_len=$(echo "$server_params" | jq -r '.max_output_len') +trt_llm_version=$(echo "$server_params" | jq -r '.trt_llm_version') + +cd ~ +rm -rf models +mkdir -p models +cd models +models_dir=$(pwd) +trt_model_path=${models_dir}/${model_name}-trt-ckpt +trt_engine_path=${models_dir}/${model_name}-trt-engine + +cd ~ +rm -rf tensorrt-demo +git clone https://github.com/neuralmagic/tensorrt-demo.git +cd tensorrt-demo +tensorrt_demo_dir=$(pwd) + +# make sure the parameter inside tensorrt_demo is consistent to envvar +sed -i.bak "/key: \"tokenizer_dir\"/,/string_value:/s|string_value: \".*\"|string_value: \"$model_path\"|" ./triton_model_repo/postprocessing/config.pbtxt +sed -i.bak "/key: \"tokenizer_dir\"/,/string_value:/s|string_value: \".*\"|string_value: \"$model_path\"|" ./triton_model_repo/preprocessing/config.pbtxt +sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/ensemble/config.pbtxt +sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/preprocessing/config.pbtxt +sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/postprocessing/config.pbtxt +sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/tensorrt_llm_bls/config.pbtxt + + +cd / +rm -rf tensorrtllm_backend +git clone https://github.com/triton-inference-server/tensorrtllm_backend.git +git lfs install +cd tensorrtllm_backend +git checkout $trt_llm_version +tensorrtllm_backend_dir=$(pwd) +git submodule update --init --recursive +cp -r ${tensorrt_demo_dir}/triton_model_repo ${tensorrtllm_backend_dir}/ + +cd /tensorrtllm_backend +cd ./tensorrt_llm/examples/${model_type} + + +if echo "$common_params" | jq -e 'has("fp8")' > /dev/null; then + + echo "Key 'fp8' exists in common params. 
Use quantize.py instead of convert_checkpoint.py" + echo "Reference: https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/llama/README.md" + python ../quantization/quantize.py \ + --model_dir ${model_path} \ + --dtype ${model_dtype} \ + --tp_size ${model_tp_size} \ + --output_dir ${trt_model_path} \ + --qformat fp8 \ + --kv_cache_dtype fp8 \ + --calib_size 2 + +else + + echo "Key 'fp8' does not exist in common params. Use convert_checkpoint.py" + python3 convert_checkpoint.py \ + --model_dir ${model_path} \ + --dtype ${model_dtype} \ + --tp_size ${model_tp_size} \ + --output_dir ${trt_model_path} + +fi + + + +trtllm-build \ +--checkpoint_dir=${trt_model_path} \ +--gpt_attention_plugin=${model_dtype} \ +--gemm_plugin=${model_dtype} \ +--remove_input_padding=enable \ +--paged_kv_cache=enable \ +--tp_size=${model_tp_size} \ +--max_batch_size=${max_batch_size} \ +--max_input_len=${max_input_len} \ +--max_output_len=${max_output_len} \ +--max_num_tokens=${max_output_len} \ +--opt_num_tokens=${max_output_len} \ +--output_dir=${trt_engine_path} + +cd /tensorrtllm_backend/triton_model_repo +rm -rf ./tensorrt_llm/1/* +cp -r ${trt_engine_path}/* ./tensorrt_llm/1 +cd /tensorrtllm_backend +python3 scripts/launch_triton_server.py \ +--world_size=${model_tp_size} \ +--model_repo=/tensorrtllm_backend/triton_model_repo & \ No newline at end of file diff --git a/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh b/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh new file mode 100644 index 0000000000000..1168912c6e229 --- /dev/null +++ b/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh @@ -0,0 +1,40 @@ +#!/bin/bash + +set -ex +set -o pipefail + + +main() { + + (which wget && which curl) || (apt-get update && apt-get install -y wget curl) + (which jq) || (apt-get update && apt-get -y install jq) + + if [ ! -f /workspace/buildkite-agent ]; then + echo "buildkite-agent binary not found. Skip plotting the results." 
+ exit 0 + fi + + # initial annotation + description="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-descriptions.md" + + # download results + cd $VLLM_SOURCE_CODE_LOC/benchmarks + mkdir -p results/ + /workspace/buildkite-agent artifact download 'results/*nightly_results.json' results/ + ls + ls results/ + + # generate figures + python3 -m pip install tabulate pandas matplotlib + python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \ + --description $description \ + --results-folder results/ + + # upload results and figures + /workspace/buildkite-agent artifact upload "nightly_results.png" + /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-pipeline.yaml + /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/tests/nightly-tests.json + /workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly_results.md +} + +main "$@" \ No newline at end of file diff --git a/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py b/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py new file mode 100644 index 0000000000000..e5cfcc64a9b2a --- /dev/null +++ b/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py @@ -0,0 +1,135 @@ +import argparse +import json +import math +from pathlib import Path + +import matplotlib.pyplot as plt +import pandas as pd +from tabulate import tabulate + + +def parse_arguments(): + parser = argparse.ArgumentParser( + description= + 'Parse command line arguments for summary-nightly-results script.') + parser.add_argument('--results-folder', + type=str, + required=True, + help='The folder where the results are stored.') + parser.add_argument('--description', + type=str, + required=True, + help='Description of the results.') + + args = parser.parse_args() + return args + + +def main(args): + bar_colors = ['#56B4E9', '#009E73', '#D55E00', '#E69F00'] + results_folder = Path(args.results_folder) + + results = [] + + # collect results + for test_file in results_folder.glob("*_nightly_results.json"): + with open(test_file, "r") as f: + results = results + json.loads(f.read()) + + # generate markdown table + df = pd.DataFrame.from_dict(results) + + md_table = tabulate(df, headers='keys', tablefmt='pipe', showindex=False) + + with open(args.description, "r") as f: + description = f.read() + + description = description.format( + nightly_results_benchmarking_table=md_table) + + with open("nightly_results.md", "w") as f: + f.write(description) + + plt.rcParams.update({'font.size': 20}) + + # plot results + fig, axes = plt.subplots(3, 3, figsize=(16, 14)) + fig.subplots_adjust(hspace=1) + methods = ["vllm", "trt", "lmdeploy", "tgi"] + for i, model in enumerate(["llama8B", "llama70B", "mixtral8x7B"]): + for j, metric in enumerate(["TTFT", "ITL"]): + means, stds = [], [] + for method in methods: + target = df['Test name'].str.contains(model) + target = target & df['Engine'].str.contains(method) + filtered_df = df[target] + + if filtered_df.empty: + means.append(0.) + stds.append(0.) 
+ else: + means.append(filtered_df[f"Mean {metric} (ms)"].values[0]) + std = filtered_df[f"Std {metric} (ms)"].values[0] + success = filtered_df["Successful req."].values[0] + stds.append(std / math.sqrt(success)) + + print(model, metric) + print(means, stds) + + ax = axes[i, j + 1] + + bars = ax.bar( + ["vllm", "trt", "lmdeploy", "tgi"], + means, + yerr=stds, + capsize=10, + ) + for idx, bar in enumerate(bars): + bar.set_color(bar_colors[idx]) + ax.set_ylim(bottom=0) + + ax.set_ylabel(f"{metric} (ms)") + ax.set_title(f"{model} {metric}") + ax.grid(axis='y') + + metric = "Tput" + j = 0 + if True: + tputs = [] + for method in methods: + target = df['Test name'].str.contains(model) + target = target & df['Engine'].str.contains(method) + filtered_df = df[target] + + if filtered_df.empty: + tputs.append(0.) + else: + input_tput = filtered_df["Input Tput (tok/s)"].values[0] + output_tput = filtered_df["Output Tput (tok/s)"].values[0] + tputs.append(input_tput + output_tput) + + print(model, metric) + print(tputs) + + ax = axes[i, j] + + bars = ax.bar( + ["vllm", "trt", "lmdeploy", "tgi"], + tputs, + ) + for idx, bar in enumerate(bars): + bar.set_color(bar_colors[idx]) + + ax.set_ylim(bottom=0) + + ax.set_ylabel("Tput (token/s)") + ax.set_title(f"{model} {metric}") + ax.grid(axis='y') + + fig.tight_layout() + fig.savefig("nightly_results.png", bbox_inches='tight', dpi=400) + + +if __name__ == '__main__': + args = parse_arguments() + main(args) diff --git a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh new file mode 100644 index 0000000000000..d6f112aaa42fd --- /dev/null +++ b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh @@ -0,0 +1,218 @@ +#!/bin/bash + +set -o pipefail + +check_gpus() { + # check the number of GPUs and GPU type. + declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l) + if [[ $gpu_count -gt 0 ]]; then + echo "GPU found." + else + echo "Need at least 1 GPU to run benchmarking." + exit 1 + fi + declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}') + echo "GPU type is $gpu_type" +} + +kill_gpu_processes() { + pkill lmdeploy || true + # waiting for GPU processes to be fully killed + sleep 10 + # Print the GPU memory usage + # so that we know if all GPU processes are killed. + gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0) + # The memory usage should be 0 MB. 
+ echo "GPU 0 Memory Usage: $gpu_memory_usage MB" +} + +json2args() { + # transforms the JSON string to command line args, and '_' is replaced to '-' + # example: + # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 } + # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1 + local json_string=$1 + local args=$( + echo "$json_string" | jq -r ' + to_entries | + map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) | + join(" ") + ' + ) + echo "$args" +} + +wait_for_server() { + # wait for vllm server to start + # return 1 if vllm server crashes + timeout 1200 bash -c ' + until curl -s localhost:8000/v1/completions > /dev/null; do + sleep 1 + done' && return 0 || return 1 +} + +run_serving_tests() { + # run serving tests using `benchmark_serving.py` + # $1: a json file specifying serving test cases + + local serving_test_file + serving_test_file=$1 + + # Iterate over serving tests + jq -c '.[]' "$serving_test_file" | while read -r params; do + # get the test name, and append the GPU type back to it. + test_name=$(echo "$params" | jq -r '.test_name') + + # if TEST_SELECTOR is set, only run the test cases that match the selector + if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then + echo "Skip test case $test_name." + continue + fi + + # append lmdeploy to the test name + test_name=lmdeploy_$test_name + + # get common parameters + common_params=$(echo "$params" | jq -r '.common_parameters') + model=$(echo "$common_params" | jq -r '.model') + tp=$(echo "$common_params" | jq -r '.tp') + dataset_name=$(echo "$common_params" | jq -r '.dataset_name') + dataset_path=$(echo "$common_params" | jq -r '.dataset_path') + port=$(echo "$common_params" | jq -r '.port') + num_prompts=$(echo "$common_params" | jq -r '.num_prompts') + + + + # get client and server arguments + server_params=$(echo "$params" | jq -r '.lmdeploy_server_parameters') + client_params=$(echo "$params" | jq -r '.lmdeploy_client_parameters') + server_args=$(json2args "$server_params") + client_args=$(json2args "$client_params") + qps_list=$(echo "$params" | jq -r '.qps_list') + qps_list=$(echo "$qps_list" | jq -r '.[] | @sh') + echo "Running over qps list $qps_list" + + # check if there is enough GPU to run the test + if [[ $gpu_count -lt $tp ]]; then + echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name." + continue + fi + + # prepare tokenizer + rm -rf /tokenizer_cache + mkdir /tokenizer_cache + python ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \ + --model "$model" \ + --cachedir /tokenizer_cache + + server_command="lmdeploy serve api_server $model \ + --tp $tp \ + --server-port $port \ + $server_args" + + # run the server + echo "Running test case $test_name" + echo "Server command: $server_command" + bash -c "$server_command" & + + # wait until the server is alive + wait_for_server + if [ $? -eq 0 ]; then + echo "" + echo "lmdeploy server is up and running." + else + echo "" + echo "lmdeploy failed to start within the timeout period." 
+ break + fi + + # get model name + model_name=$(python ../.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py) + + # iterate over different QPS + for qps in $qps_list; do + # remove the surrounding single quote from qps + if [[ "$qps" == *"inf"* ]]; then + echo "qps was $qps" + qps="inf" + echo "now qps is $qps" + fi + + new_test_name=$test_name"_qps_"$qps + + client_command="python3 benchmark_serving.py \ + --backend lmdeploy \ + --tokenizer /tokenizer_cache \ + --dataset-name $dataset_name \ + --dataset-path $dataset_path \ + --num-prompts $num_prompts \ + --port $port \ + --save-result \ + --result-dir $RESULTS_FOLDER \ + --result-filename ${new_test_name}.json \ + --request-rate $qps \ + --model \"$model_name\" \ + $client_args" + + echo "Running test case $test_name with qps $qps" + echo "Client command: $client_command" + + eval "$client_command" + + # record the benchmarking commands + jq_output=$(jq -n \ + --arg server "$server_command" \ + --arg client "$client_command" \ + --arg gpu "$gpu_type" \ + --arg engine "lmdeploy" \ + '{ + server_command: $server, + client_command: $client, + gpu_type: $gpu, + engine: $engine + }') + echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands" + + done + + # clean up + kill_gpu_processes + rm -rf /root/.cache/huggingface/* + done +} + + +upload_to_buildkite() { + # upload the benchmarking results to buildkite + + # if the agent binary is not found, skip uploading the results, exit 0 + if [ ! -f /workspace/buildkite-agent ]; then + echo "buildkite-agent binary not found. Skip uploading the results." + return 0 + fi + # /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md + /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*" +} + + +main() { + + check_gpus + # enter vllm directory + cd $VLLM_SOURCE_CODE_LOC/benchmarks + + declare -g RESULTS_FOLDER=results/ + mkdir -p $RESULTS_FOLDER + BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/ + + python -m pip install transformers==4.41.2 + + export CURRENT_LLM_SERVING_ENGINE=lmdeploy + run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json + python -m pip install tabulate pandas + python $BENCHMARK_ROOT/scripts/summary-nightly-results.py + upload_to_buildkite + +} + +main "$@" diff --git a/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh new file mode 100644 index 0000000000000..fed03654f8b77 --- /dev/null +++ b/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh @@ -0,0 +1,216 @@ +#!/bin/bash + +set -o pipefail + +check_gpus() { + # check the number of GPUs and GPU type. + declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l) + if [[ $gpu_count -gt 0 ]]; then + echo "GPU found." + else + echo "Need at least 1 GPU to run benchmarking." + exit 1 + fi + declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}') + echo "GPU type is $gpu_type" +} + +kill_gpu_processes() { + pkill text-generation || true + # waiting for GPU processes to be fully killed + sleep 10 + # Print the GPU memory usage + # so that we know if all GPU processes are killed. + gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0) + # The memory usage should be 0 MB. 
+ echo "GPU 0 Memory Usage: $gpu_memory_usage MB" +} + +json2args() { + # transforms the JSON string to command line args, and '_' is replaced to '-' + # example: + # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 } + # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1 + local json_string=$1 + local args=$( + echo "$json_string" | jq -r ' + to_entries | + map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) | + join(" ") + ' + ) + echo "$args" +} + +wait_for_server() { + timeout 1200 bash -c ' + until curl -s localhost:8000/generate_stream > /dev/null; do + sleep 1 + done' && return 0 || return 1 +} + +run_serving_tests() { + # run serving tests using `benchmark_serving.py` + # $1: a json file specifying serving test cases + + local serving_test_file + serving_test_file=$1 + + # Iterate over serving tests + jq -c '.[]' "$serving_test_file" | while read -r params; do + # get the test name, and append the GPU type back to it. + test_name=$(echo "$params" | jq -r '.test_name') + + + # if TEST_SELECTOR is set, only run the test cases that match the selector + if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then + echo "Skip test case $test_name." + continue + fi + + # append tgi to the test name + test_name=tgi_$test_name + + # get common parameters + common_params=$(echo "$params" | jq -r '.common_parameters') + model=$(echo "$common_params" | jq -r '.model') + tp=$(echo "$common_params" | jq -r '.tp') + dataset_name=$(echo "$common_params" | jq -r '.dataset_name') + dataset_path=$(echo "$common_params" | jq -r '.dataset_path') + port=$(echo "$common_params" | jq -r '.port') + num_prompts=$(echo "$common_params" | jq -r '.num_prompts') + + # get client and server arguments + server_params=$(echo "$params" | jq -r '.tgi_server_parameters') + client_params=$(echo "$params" | jq -r '.tgi_client_parameters') + server_args=$(json2args "$server_params") + client_args=$(json2args "$client_params") + qps_list=$(echo "$params" | jq -r '.qps_list') + qps_list=$(echo "$qps_list" | jq -r '.[] | @sh') + echo "Running over qps list $qps_list" + + # check if there is enough GPU to run the test + if [[ $gpu_count -lt $tp ]]; then + echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name." + continue + fi + + if echo "$common_params" | jq -e 'has("fp8")' > /dev/null; then + echo "Key 'fp8' exists in common params." + server_command="/tgi-entrypoint.sh \ + --model-id $model \ + --num-shard $tp \ + --port $port \ + --quantize fp8 \ + $server_args" + else + echo "Key 'fp8' does not exist in common params." + server_command="/tgi-entrypoint.sh \ + --model-id $model \ + --num-shard $tp \ + --port $port \ + $server_args" + fi + + + + + # run the server + echo "Running test case $test_name" + echo "Server command: $server_command" + eval "$server_command" & + + # wait until the server is alive + wait_for_server + if [ $? -eq 0 ]; then + echo "" + echo "tgi server is up and running." + else + echo "" + echo "tgi failed to start within the timeout period." 
+ break + fi + + # iterate over different QPS + for qps in $qps_list; do + # remove the surrounding single quote from qps + if [[ "$qps" == *"inf"* ]]; then + echo "qps was $qps" + qps="inf" + echo "now qps is $qps" + fi + + new_test_name=$test_name"_qps_"$qps + + client_command="python3 benchmark_serving.py \ + --backend tgi \ + --model $model \ + --dataset-name $dataset_name \ + --dataset-path $dataset_path \ + --num-prompts $num_prompts \ + --port $port \ + --save-result \ + --result-dir $RESULTS_FOLDER \ + --result-filename ${new_test_name}.json \ + --request-rate $qps \ + $client_args" + + echo "Running test case $test_name with qps $qps" + echo "Client command: $client_command" + + eval "$client_command" + + # record the benchmarking commands + jq_output=$(jq -n \ + --arg server "$server_command" \ + --arg client "$client_command" \ + --arg gpu "$gpu_type" \ + --arg engine "tgi" \ + '{ + server_command: $server, + client_command: $client, + gpu_type: $gpu, + engine: $engine + }') + echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands" + + done + + # clean up + kill_gpu_processes + rm -rf /root/.cache/huggingface/* + done +} + + + +upload_to_buildkite() { + # upload the benchmarking results to buildkite + + # if the agent binary is not found, skip uploading the results, exit 0 + if [ ! -f /workspace/buildkite-agent ]; then + echo "buildkite-agent binary not found. Skip uploading the results." + return 0 + fi + # /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md + /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*" +} + +main() { + + check_gpus + # enter vllm directory + cd $VLLM_SOURCE_CODE_LOC/benchmarks + declare -g RESULTS_FOLDER=results/ + mkdir -p $RESULTS_FOLDER + BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/ + + export CURRENT_LLM_SERVING_ENGINE=tgi + run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json + python -m pip install tabulate pandas + python $BENCHMARK_ROOT/scripts/summary-nightly-results.py + upload_to_buildkite + +} + +main "$@" diff --git a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh new file mode 100644 index 0000000000000..4a82b9ec64d71 --- /dev/null +++ b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh @@ -0,0 +1,214 @@ +#!/bin/bash + +set -o pipefail + +check_gpus() { + # check the number of GPUs and GPU type. + declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l) + if [[ $gpu_count -gt 0 ]]; then + echo "GPU found." + else + echo "Need at least 1 GPU to run benchmarking." + exit 1 + fi + declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}') + echo "GPU type is $gpu_type" +} + +kill_gpu_processes() { + pkill tritonserver || true + # waiting for GPU processes to be fully killed + sleep 20 + # Print the GPU memory usage + # so that we know if all GPU processes are killed. + gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0) + # The memory usage should be 0 MB. 
+ echo "GPU 0 Memory Usage: $gpu_memory_usage MB" +} + +json2args() { + # transforms the JSON string to command line args, and '_' is replaced to '-' + # example: + # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 } + # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1 + local json_string=$1 + local args=$( + echo "$json_string" | jq -r ' + to_entries | + map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) | + join(" ") + ' + ) + echo "$args" +} + +wait_for_server() { + timeout 1200 bash -c ' + until curl -s localhost:8000/generate_stream > /dev/null; do + sleep 1 + done' && return 0 || return 1 +} + +run_serving_tests() { + # run serving tests using `benchmark_serving.py` + # $1: a json file specifying serving test cases + + local serving_test_file + serving_test_file=$1 + + # Iterate over serving tests + jq -c '.[]' "$serving_test_file" | while read -r params; do + # get the test name, and append the GPU type back to it. + test_name=$(echo "$params" | jq -r '.test_name') + + # if TEST_SELECTOR is set, only run the test cases that match the selector + if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then + echo "Skip test case $test_name." + continue + fi + + # append trt to the test name + test_name=trt_$test_name + + # get common parameters + common_params=$(echo "$params" | jq -r '.common_parameters') + model=$(echo "$common_params" | jq -r '.model') + tp=$(echo "$common_params" | jq -r '.tp') + dataset_name=$(echo "$common_params" | jq -r '.dataset_name') + dataset_path=$(echo "$common_params" | jq -r '.dataset_path') + port=$(echo "$common_params" | jq -r '.port') + num_prompts=$(echo "$common_params" | jq -r '.num_prompts') + + # get client and server arguments + server_params=$(echo "$params" | jq -r '.trt_server_parameters') + client_params=$(echo "$params" | jq -r '.trt_client_parameters') + client_args=$(json2args "$client_params") + qps_list=$(echo "$params" | jq -r '.qps_list') + qps_list=$(echo "$qps_list" | jq -r '.[] | @sh') + echo "Running over qps list $qps_list" + + # check if there is enough GPU to run the test + if [[ $gpu_count -lt $tp ]]; then + echo "Required model_tp_size $tp but only $gpu_count GPU found. Skip testcase $test_name." + continue + fi + + + + cd $VLLM_SOURCE_CODE_LOC/benchmarks + + + echo "Running test case $test_name" + bash ../.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh "$server_params" "$common_params" + + # wait until the server is alive + wait_for_server + if [ $? -eq 0 ]; then + echo "" + echo "trt server is up and running." + else + echo "" + echo "trt failed to start within the timeout period." 
+ break + fi + + # prepare tokenizer + cd $VLLM_SOURCE_CODE_LOC/benchmarks + rm -rf /tokenizer_cache + mkdir /tokenizer_cache + python ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \ + --model "$model" \ + --cachedir /tokenizer_cache + cd $VLLM_SOURCE_CODE_LOC/benchmarks + + + # iterate over different QPS + for qps in $qps_list; do + # remove the surrounding single quote from qps + if [[ "$qps" == *"inf"* ]]; then + echo "qps was $qps" + qps="inf" + echo "now qps is $qps" + fi + + new_test_name=$test_name"_qps_"$qps + + client_command="python3 benchmark_serving.py \ + --backend tensorrt-llm \ + --tokenizer /tokenizer_cache \ + --model $model \ + --dataset-name $dataset_name \ + --dataset-path $dataset_path \ + --num-prompts $num_prompts \ + --port $port \ + --save-result \ + --result-dir $RESULTS_FOLDER \ + --result-filename ${new_test_name}.json \ + --request-rate $qps \ + $client_args" + + echo "Running test case $test_name with qps $qps" + echo "Client command: $client_command" + + eval "$client_command" + + server_command="" + # record the benchmarking commands + jq_output=$(jq -n \ + --arg server "$server_command" \ + --arg client "$client_command" \ + --arg gpu "$gpu_type" \ + --arg engine "trt" \ + '{ + server_command: $server, + client_command: $client, + gpu_type: $gpu, + engine: $engine + }') + echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands" + + done + + # clean up + kill_gpu_processes + rm -rf /root/.cache/huggingface/* + done +} + +upload_to_buildkite() { + # upload the benchmarking results to buildkite + + # if the agent binary is not found, skip uploading the results, exit 0 + if [ ! -f /workspace/buildkite-agent ]; then + echo "buildkite-agent binary not found. Skip uploading the results." + return 0 + fi + # /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md + /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*" +} + + +main() { + + check_gpus + + + # enter vllm directory + cd $VLLM_SOURCE_CODE_LOC/benchmarks + + declare -g RESULTS_FOLDER=results/ + mkdir -p $RESULTS_FOLDER + BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/ + + # update transformers package, to make sure mixtral tokenizer is available + python -m pip install transformers -U + + export CURRENT_LLM_SERVING_ENGINE=trt + run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json + python -m pip install tabulate pandas + python $BENCHMARK_ROOT/scripts/summary-nightly-results.py + upload_to_buildkite + +} + +main "$@" diff --git a/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh new file mode 100644 index 0000000000000..663045b8a9122 --- /dev/null +++ b/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh @@ -0,0 +1,221 @@ +#!/bin/bash + +set -o pipefail + +check_gpus() { + # check the number of GPUs and GPU type. + declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l) + if [[ $gpu_count -gt 0 ]]; then + echo "GPU found." + else + echo "Need at least 1 GPU to run benchmarking." + exit 1 + fi + declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}') + echo "GPU type is $gpu_type" +} + +kill_gpu_processes() { + # kill all processes on GPU. + pkill pt_main_thread + sleep 10 + + # remove vllm config file + rm -rf ~/.config/vllm + + # Print the GPU memory usage + # so that we know if all GPU processes are killed. 
+ gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0) + # The memory usage should be 0 MB. + echo "GPU 0 Memory Usage: $gpu_memory_usage MB" +} + +json2args() { + # transforms the JSON string to command line args, and '_' is replaced to '-' + # example: + # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 } + # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1 + local json_string=$1 + local args=$( + echo "$json_string" | jq -r ' + to_entries | + map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) | + join(" ") + ' + ) + echo "$args" +} + +wait_for_server() { + # wait for vllm server to start + # return 1 if vllm server crashes + timeout 1200 bash -c ' + until curl -s localhost:8000/v1/completions > /dev/null; do + sleep 1 + done' && return 0 || return 1 +} + +run_serving_tests() { + # run serving tests using `benchmark_serving.py` + # $1: a json file specifying serving test cases + + local serving_test_file + serving_test_file=$1 + + # Iterate over serving tests + jq -c '.[]' "$serving_test_file" | while read -r params; do + # get the test name, and append the GPU type back to it. + test_name=$(echo "$params" | jq -r '.test_name') + + # if TEST_SELECTOR is set, only run the test cases that match the selector + if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then + echo "Skip test case $test_name." + continue + fi + + # append vllm to the test name + test_name=vllm_$test_name + + + # get common parameters + common_params=$(echo "$params" | jq -r '.common_parameters') + model=$(echo "$common_params" | jq -r '.model') + tp=$(echo "$common_params" | jq -r '.tp') + dataset_name=$(echo "$common_params" | jq -r '.dataset_name') + dataset_path=$(echo "$common_params" | jq -r '.dataset_path') + port=$(echo "$common_params" | jq -r '.port') + num_prompts=$(echo "$common_params" | jq -r '.num_prompts') + + # get client and server arguments + server_params=$(echo "$params" | jq -r '.vllm_server_parameters') + client_params=$(echo "$params" | jq -r '.vllm_client_parameters') + server_args=$(json2args "$server_params") + client_args=$(json2args "$client_params") + qps_list=$(echo "$params" | jq -r '.qps_list') + qps_list=$(echo "$qps_list" | jq -r '.[] | @sh') + echo "Running over qps list $qps_list" + + # check if there is enough GPU to run the test + if [[ $gpu_count -lt $tp ]]; then + echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name." + continue + fi + + if echo "$common_params" | jq -e 'has("fp8")' > /dev/null; then + echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience." + model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model') + server_command="python3 \ + -m vllm.entrypoints.openai.api_server \ + -tp $tp \ + --model $model \ + --port $port \ + $server_args" + else + echo "Key 'fp8' does not exist in common params." + server_command="python3 \ + -m vllm.entrypoints.openai.api_server \ + -tp $tp \ + --model $model \ + --port $port \ + $server_args" + fi + + # run the server + echo "Running test case $test_name" + echo "Server command: $server_command" + eval "$server_command" & + + # wait until the server is alive + wait_for_server + if [ $? -eq 0 ]; then + echo "" + echo "vllm server is up and running." + else + echo "" + echo "vllm failed to start within the timeout period." 
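Putting the pieces above together, for the llama8B_tp1 entry of nightly-tests.json the assembled vLLM server command would expand to roughly the following (a sketch; the script builds the string at runtime, and the fp8 branch differs only in swapping in the quantized model name):

# Illustrative only: the expanded $server_command for the llama8B_tp1 case.
python3 -m vllm.entrypoints.openai.api_server \
    -tp 1 \
    --model meta-llama/Meta-Llama-3-8B \
    --port 8000 \
    --disable-log-stats \
    --disable-log-requests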
+ break + fi + + # iterate over different QPS + for qps in $qps_list; do + # remove the surrounding single quote from qps + if [[ "$qps" == *"inf"* ]]; then + echo "qps was $qps" + qps="inf" + echo "now qps is $qps" + fi + + new_test_name=$test_name"_qps_"$qps + + client_command="python3 benchmark_serving.py \ + --backend vllm \ + --model $model \ + --dataset-name $dataset_name \ + --dataset-path $dataset_path \ + --num-prompts $num_prompts \ + --port $port \ + --save-result \ + --result-dir $RESULTS_FOLDER \ + --result-filename ${new_test_name}.json \ + --request-rate $qps \ + $client_args" + + echo "Running test case $test_name with qps $qps" + echo "Client command: $client_command" + + eval "$client_command" + + # record the benchmarking commands + jq_output=$(jq -n \ + --arg server "$server_command" \ + --arg client "$client_command" \ + --arg gpu "$gpu_type" \ + --arg engine "vllm" \ + '{ + server_command: $server, + client_command: $client, + gpu_type: $gpu, + engine: $engine + }') + echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands" + + done + + # clean up + kill_gpu_processes + rm -rf /root/.cache/huggingface/* + done +} + + +upload_to_buildkite() { + # upload the benchmarking results to buildkite + + # if the agent binary is not found, skip uploading the results, exit 0 + if [ ! -f /workspace/buildkite-agent ]; then + echo "buildkite-agent binary not found. Skip uploading the results." + return 0 + fi + # /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md + /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*" +} + +main() { + + check_gpus + # enter vllm directory + cd $VLLM_SOURCE_CODE_LOC/benchmarks + declare -g RESULTS_FOLDER=results/ + mkdir -p $RESULTS_FOLDER + BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/ + + export CURRENT_LLM_SERVING_ENGINE=vllm + run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json + + python3 -m pip install tabulate pandas + python3 $BENCHMARK_ROOT/scripts/summary-nightly-results.py + upload_to_buildkite + +} + +main "$@" diff --git a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py new file mode 100644 index 0000000000000..782d1ef9aab98 --- /dev/null +++ b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py @@ -0,0 +1,76 @@ +import datetime +import json +import os +from pathlib import Path + +import pandas as pd +from tabulate import tabulate + +results_folder = Path("results/") + +# serving results and the keys that will be printed into markdown +serving_results = [] +serving_column_mapping = { + "test_name": "Test name", + "gpu_type": "GPU", + "completed": "Successful req.", + "request_throughput": "Tput (req/s)", + "mean_ttft_ms": "Mean TTFT (ms)", + "std_ttft_ms": "Std TTFT (ms)", + "mean_itl_ms": "Mean ITL (ms)", + "std_itl_ms": "Std ITL (ms)", + "input_throughput": "Input Tput (tok/s)", + "output_throughput": "Output Tput (tok/s)", + "engine": "Engine", +} + +if __name__ == "__main__": + + # collect results + for test_file in results_folder.glob("*.json"): + + with open(test_file, "r") as f: + raw_result = json.loads(f.read()) + + # attach the benchmarking command to raw_result + with open(test_file.with_suffix(".commands"), "r") as f: + command = json.loads(f.read()) + raw_result.update(command) + + # update the test name of this result + raw_result.update({"test_name": test_file.stem}) + + # add the result to 
raw_result + serving_results.append(raw_result) + continue + + serving_results = pd.DataFrame.from_dict(serving_results) + + if not serving_results.empty: + serving_results = serving_results[list( + serving_column_mapping.keys())].rename( + columns=serving_column_mapping) + + serving_md_table_with_headers = tabulate(serving_results, + headers='keys', + tablefmt='pipe', + showindex=False) + # remove the first line of header + serving_md_table_lines = serving_md_table_with_headers.split('\n') + serving_md_table_without_header = '\n'.join(serving_md_table_lines[2:]) + + prefix = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + prefix = prefix + "_" + os.environ.get("CURRENT_LLM_SERVING_ENGINE") + + # document benchmarking results in markdown + with open(results_folder / f"{prefix}_nightly_results.md", "w") as f: + # document results with header. + # for those who wants to reproduce our benchmark. + f.write(serving_md_table_with_headers) + f.write('\n') + + # document benchmarking results in json + with open(results_folder / f"{prefix}_nightly_results.json", "w") as f: + + results = serving_results.to_dict(orient='records') + f.write(json.dumps(results)) diff --git a/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/.buildkite/nightly-benchmarks/tests/nightly-tests.json new file mode 100644 index 0000000000000..f250833c62710 --- /dev/null +++ b/.buildkite/nightly-benchmarks/tests/nightly-tests.json @@ -0,0 +1,116 @@ +[ + { + "test_name": "llama8B_tp1", + "qps_list": [4], + "common_parameters": { + "model": "meta-llama/Meta-Llama-3-8B", + "tp": 1, + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 500, + "port": 8000 + }, + "lmdeploy_server_parameters": { + }, + "lmdeploy_client_parameters": { + }, + "tgi_server_parameters": { + }, + "tgi_client_parameters": { + "endpoint": "/generate_stream" + }, + "trt_server_parameters": { + "model_type": "llama", + "model_dtype": "float16", + "max_batch_size": 256, + "max_input_len": 4096, + "max_output_len": 4096, + "trt_llm_version": "r24.04" + }, + "trt_client_parameters": { + "endpoint": "/v2/models/ensemble/generate_stream" + }, + "vllm_server_parameters": { + "disable_log_stats": "", + "disable_log_requests": "" + }, + "vllm_client_parameters": { + } + }, + { + "test_name": "llama70B_tp4", + "qps_list": [2], + "common_parameters": { + "model": "meta-llama/Meta-Llama-3-70B-Instruct", + "tp": 4, + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 500, + "port": 8000 + }, + "lmdeploy_server_parameters": { + }, + "lmdeploy_client_parameters": { + }, + "tgi_server_parameters": { + }, + "tgi_client_parameters": { + "endpoint": "/generate_stream" + }, + "trt_server_parameters": { + "model_type": "llama", + "model_dtype": "float16", + "max_batch_size": 256, + "max_input_len": 4096, + "max_output_len": 4096, + "trt_llm_version": "r24.04" + }, + "trt_client_parameters": { + "endpoint": "/v2/models/ensemble/generate_stream" + }, + "vllm_server_parameters": { + "disable_log_stats": "", + "disable_log_requests": "" + }, + "vllm_client_parameters": { + } + }, + { + "test_name": "mixtral8x7B_tp2", + "qps_list": [2], + "common_parameters": { + "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", + "tp": 2, + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 500, + "port": 8000 + }, + "lmdeploy_server_parameters": { + }, + "lmdeploy_client_parameters": { + }, + 
"tgi_server_parameters": { + }, + "tgi_client_parameters": { + "endpoint": "/generate_stream" + }, + "trt_server_parameters": { + "model_type": "llama", + "model_dtype": "float16", + "max_batch_size": 256, + "max_input_len": 4096, + "max_output_len": 4096, + "trt_llm_version": "r24.04" + }, + "trt_client_parameters": { + "endpoint": "/v2/models/ensemble/generate_stream" + }, + "vllm_server_parameters": { + "disable_log_stats": "", + "disable_log_requests": "" + }, + "vllm_client_parameters": { + } + } +] \ No newline at end of file diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml index 1959f9752069f..5be9a553dddd4 100644 --- a/.buildkite/release-pipeline.yaml +++ b/.buildkite/release-pipeline.yaml @@ -1,21 +1,19 @@ steps: - - block: "Build wheels" - - - label: "Build wheel - Python {{matrix.python_version}}, CUDA {{matrix.cuda_version}}" + - label: "Build wheel - CUDA {{matrix.cuda_version}}" agents: queue: cpu_queue commands: - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION={{matrix.cuda_version}} --build-arg PYTHON_VERSION={{matrix.python_version}} --tag vllm-ci:build-image --target build --progress plain ." + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg buildkite_commit=$BUILDKITE_COMMIT --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION={{matrix.cuda_version}} --tag vllm-ci:build-image --target build --progress plain ." - "mkdir artifacts" - - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image cp -r dist /artifacts_host" + - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" + # rename the files to change linux -> manylinux1 + - "for f in artifacts/dist/*.whl; do mv -- \"$$f\" \"$${f/linux/manylinux1}\"; done" - "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/$BUILDKITE_COMMIT/" + - "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/nightly/" + env: + DOCKER_BUILDKIT: "1" matrix: setup: cuda_version: - "11.8.0" - "12.1.0" - python_version: - - "3.8" - - "3.9" - - "3.10" - - "3.11" diff --git a/.buildkite/run-amd-test.sh b/.buildkite/run-amd-test.sh index bde8ab6184d3c..618d712b0279b 100644 --- a/.buildkite/run-amd-test.sh +++ b/.buildkite/run-amd-test.sh @@ -2,6 +2,15 @@ set -ex # Print ROCm version +echo "--- Confirming Clean Initial State" +while true; do + sleep 3 + if grep -q clean /opt/amdgpu/etc/gpu_state; then + echo "GPUs state is \"clean\"" + break + fi +done + echo "--- ROCm info" rocminfo @@ -45,15 +54,10 @@ while true; do fi done -echo "--- Building container" -sha=$(git rev-parse --short HEAD) -image_name=rocm_${sha} -container_name=rocm_${sha}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo) -docker build \ - -t ${image_name} \ - -f Dockerfile.rocm \ - --progress plain \ - . 
+echo "--- Pulling container" +image_name="rocmshared/vllm-ci:${BUILDKITE_COMMIT}" +container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)" +docker pull ${image_name} remove_docker_container() { docker rm -f ${container_name} || docker image rm -f ${image_name} || true @@ -62,11 +66,18 @@ trap remove_docker_container EXIT echo "--- Running container" +HF_CACHE="$(realpath ~)/huggingface" +mkdir -p ${HF_CACHE} +HF_MOUNT="/root/.cache/huggingface" + docker run \ --device /dev/kfd --device /dev/dri \ --network host \ + --shm-size=16gb \ --rm \ -e HF_TOKEN \ + -v ${HF_CACHE}:${HF_MOUNT} \ + -e HF_HOME=${HF_MOUNT} \ --name ${container_name} \ ${image_name} \ /bin/bash -c "${@}" diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh index f4fa24be1f20f..a7678aae54644 100644 --- a/.buildkite/run-cpu-test.sh +++ b/.buildkite/run-cpu-test.sh @@ -12,8 +12,10 @@ trap remove_docker_container EXIT remove_docker_container # Run the image -docker run -itd -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 --cpuset-mems=1 --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --name cpu-test cpu-test -docker run -itd -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 --cpuset-mems=1 --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --name cpu-test-avx2 cpu-test-avx2 +docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 \ + --cpuset-mems=1 --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --name cpu-test cpu-test +docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 \ + --cpuset-mems=1 --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --name cpu-test-avx2 cpu-test-avx2 # offline inference docker exec cpu-test bash -c "python3 examples/offline_inference.py" @@ -23,4 +25,4 @@ docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py" docker exec cpu-test bash -c "cd tests; pip install pytest Pillow protobuf cd ../ - pytest -v -s tests/models -m \"not vlm\" --ignore=tests/models/test_embedding.py --ignore=tests/models/test_registry.py" + pytest -v -s tests/models -m \"not vlm\" --ignore=tests/models/test_embedding.py --ignore=tests/models/test_registry.py --ignore=tests/models/test_jamba.py" # Mamba on CPU is not supported diff --git a/.buildkite/run-multi-node-test.sh b/.buildkite/run-multi-node-test.sh new file mode 100755 index 0000000000000..7ac4dcc4c786d --- /dev/null +++ b/.buildkite/run-multi-node-test.sh @@ -0,0 +1,105 @@ +#!/bin/bash + +set -euox pipefail + +if [[ $# -lt 4 ]]; then + echo "Usage: .buildkite/run-multi-node-test.sh WORKING_DIR NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND1 COMMAND2 ... COMMANDN" + exit 1 +fi + +WORKING_DIR=$1 +NUM_NODES=$2 +NUM_GPUS=$3 +DOCKER_IMAGE=$4 + +shift 4 +COMMANDS=("$@") +if [ ${#COMMANDS[@]} -ne $NUM_NODES ]; then + echo "The number of commands must be equal to the number of nodes." 
+ echo "Number of nodes: $NUM_NODES" + echo "Number of commands: ${#COMMANDS[@]}" + exit 1 +fi + +echo "List of commands" +for command in "${COMMANDS[@]}"; do + echo $command +done + +start_network() { + docker network create --subnet=192.168.10.0/24 docker-net +} + +start_nodes() { + for node in $(seq 0 $(($NUM_NODES-1))); do + GPU_DEVICES='"device=' + for node_gpu in $(seq 0 $(($NUM_GPUS - 1))); do + DEVICE_NUM=$(($node * $NUM_GPUS + $node_gpu)) + GPU_DEVICES+=$(($DEVICE_NUM)) + if [ $node_gpu -lt $(($NUM_GPUS - 1)) ]; then + GPU_DEVICES+=',' + fi + done + GPU_DEVICES+='"' + + # start the container in detached mode + # things to note: + # 1. --shm-size=10.24gb is required. don't use --ipc=host + # 2. pass HF_TOKEN to the container + # 3. map the huggingface cache directory to the container + # 3. assign ip addresses to the containers (head node: 192.168.10.10, worker nodes: + # starting from 192.168.10.11) + docker run -d --gpus "$GPU_DEVICES" --shm-size=10.24gb -e HF_TOKEN -v ~/.cache/huggingface:/root/.cache/huggingface --name node$node --network docker-net --ip 192.168.10.$((10 + $node)) --rm $DOCKER_IMAGE /bin/bash -c "tail -f /dev/null" + + # organize containers into a ray cluster + if [ $node -eq 0 ]; then + # start the ray head node + docker exec -d node$node /bin/bash -c "ray start --head --port=6379 --block" + # wait for the head node to be ready + sleep 10 + else + # start the ray worker nodes, and connect them to the head node + docker exec -d node$node /bin/bash -c "ray start --address=192.168.10.10:6379 --block" + fi + done + + # wait for the cluster to be ready + sleep 10 + + # print the cluster status + docker exec node0 /bin/bash -c "ray status" +} + +run_nodes() { + # important: iterate in reverse order to start the head node last + # we start the worker nodes first, in detached mode, and then start the head node + # in the foreground, so that the output of the head node is visible in the buildkite logs + for node in $(seq $(($NUM_NODES - 1)) -1 0); do + GPU_DEVICES='"device=' + for node_gpu in $(seq 0 $(($NUM_GPUS - 1))); do + DEVICE_NUM=$(($node * $NUM_GPUS + $node_gpu)) + GPU_DEVICES+=$(($DEVICE_NUM)) + if [ $node_gpu -lt $(($NUM_GPUS - 1)) ]; then + GPU_DEVICES+=',' + fi + done + GPU_DEVICES+='"' + echo "Running node$node with GPU devices: $GPU_DEVICES" + if [ $node -ne 0 ]; then + docker exec -d node$node /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}" + else + docker exec node$node /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}" + fi + done +} +cleanup() { + for node in $(seq 0 $(($NUM_NODES-1))); do + docker stop node$node + done + docker network rm docker-net +} +trap cleanup EXIT +start_network +start_nodes +run_nodes + diff --git a/.buildkite/run-openvino-test.sh b/.buildkite/run-openvino-test.sh new file mode 100755 index 0000000000000..70e56596c4a86 --- /dev/null +++ b/.buildkite/run-openvino-test.sh @@ -0,0 +1,14 @@ +# This script build the OpenVINO docker image and run the offline inference inside the container. +# It serves a sanity check for compilation and basic model usage. +set -ex + +# Try building the docker image +docker build -t openvino-test -f Dockerfile.openvino . 
+ +# Setup cleanup +remove_docker_container() { docker rm -f openvino-test || true; } +trap remove_docker_container EXIT +remove_docker_container + +# Run the image and launch offline inference +docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/vllm/examples/offline_inference.py diff --git a/.buildkite/run-tpu-test.sh b/.buildkite/run-tpu-test.sh new file mode 100644 index 0000000000000..4aabd123ae234 --- /dev/null +++ b/.buildkite/run-tpu-test.sh @@ -0,0 +1,16 @@ +set -e + +# Build the docker image. +docker build -f Dockerfile.tpu -t vllm-tpu . + +# Set up cleanup. +remove_docker_container() { docker rm -f tpu-test || true; } +trap remove_docker_container EXIT +# Remove the container that might not be cleaned up in the previous run. +remove_docker_container + +# For HF_TOKEN. +source /etc/environment +# Run a simple end-to-end example. +docker run --privileged --net host --shm-size=16G -it -e HF_TOKEN=$HF_TOKEN --name tpu-test vllm-tpu \ + python3 /workspace/vllm/examples/offline_inference_tpu.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index c337a81d4a0d2..e7dd1fdb2e660 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -1,11 +1,38 @@ # In this file, you can add more tests to run either by adding a new step or # adding a new command to an existing step. See different options here for examples. -# This script will be feed into Jinja template in `test-template-aws.j2` to generate -# the final pipeline yaml file. + +# This script will be feed into Jinja template in `test-template-aws.j2` at +# https://github.com/vllm-project/buildkite-ci/blob/main/scripts/test-template-aws.j2 +# to generate the final pipeline yaml file. + steps: +- label: Async Engine, Inputs, Utils, Worker Test + fast_check: true + fast_check_only: true + commands: + - pytest -v -s async_engine # Async Engine + - pytest -v -s test_inputs.py + - pytest -v -s multimodal + - pytest -v -s test_utils.py # Utils + - pytest -v -s worker # Worker + +- label: Tensorizer, Metrics, Tracing Test + fast_check: true + fast_check_only: true + commands: + - apt-get install -y curl libsodium23 && pytest -v -s tensorizer_loader # Tensorizer + - pytest -v -s metrics # Metrics + - "pip install \ + opentelemetry-sdk \ + opentelemetry-api \ + opentelemetry-exporter-otlp \ + opentelemetry-semantic-conventions-ai" # Tracing + - pytest -v -s tracing + - label: Regression Test mirror_hardwares: [amd] + fast_check: true command: pytest -v -s test_regression.py working_dir: "/vllm-workspace/tests" # optional @@ -15,16 +42,22 @@ steps: - label: Basic Correctness Test mirror_hardwares: [amd] + fast_check: true commands: - - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_basic_correctness.py - - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_basic_correctness.py + # This flashinfer installation will fail on AMD ROCm, so it is set as optional. 
+ - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.8/flashinfer-0.0.8+cu121torch2.3-cp310-cp310-linux_x86_64.whl || true + - pytest -v -s basic_correctness/test_basic_correctness.py + - pytest -v -s basic_correctness/test_cpu_offload.py - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py - label: Core Test mirror_hardwares: [amd] - command: pytest -v -s core + fast_check: true + commands: + - pytest -v -s core + - pytest -v -s distributed/test_parallel_state.py - label: Distributed Comm Ops Test #mirror_hardwares: [amd] @@ -34,24 +67,38 @@ steps: - pytest -v -s distributed/test_comm_ops.py - pytest -v -s distributed/test_shm_broadcast.py +- label: 2 Node Tests (4 GPUs in total) + working_dir: "/vllm-workspace/tests" + num_gpus: 2 + num_nodes: 2 + commands: + - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up) + - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py + - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py + - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up) + - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py + - label: Distributed Tests (2 GPUs) mirror_hardwares: [amd] working_dir: "/vllm-workspace/tests" num_gpus: 2 commands: - # FIXIT: find out which code initialize cuda before running the test - # before the fix, we need to use spawn to test it - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py + - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray VLLM_USE_RAY_SPMD_WORKER=1 VLLM_USE_RAY_COMPILED_DAG=1 pytest -v -s distributed/test_basic_distributed_correctness.py + - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray VLLM_USE_RAY_SPMD_WORKER=1 VLLM_USE_RAY_COMPILED_DAG=1 pytest -v -s distributed/test_basic_distributed_correctness.py - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py + - TEST_DIST_MODEL=llava-hf/llava-1.5-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_multimodal_broadcast.py + - TEST_DIST_MODEL=microsoft/Phi-3-vision-128k-instruct DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_multimodal_broadcast.py - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s 
distributed/test_chunked_prefill_distributed.py - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py - - pytest -v -s spec_decode/e2e/test_integration_dist.py + - TEST_DIST_MODEL=llava-hf/llava-1.5-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_multimodal_broadcast.py + - TEST_DIST_MODEL=microsoft/Phi-3-vision-128k-instruct DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_multimodal_broadcast.py + - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py @@ -59,26 +106,36 @@ steps: #mirror_hardwares: [amd] working_dir: "/vllm-workspace/tests" num_gpus: 4 + fast_check: true commands: - # FIXIT: find out which code initialize cuda before running the test - # before the fix, we need to use spawn to test it - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -v -s distributed/test_pynccl.py # We want to test that models which use 2 GPUs work with 4 GPUs, which is why we duplicate them here. # See https://github.com/vllm-project/vllm/pull/5473#issuecomment-2166601837 for context. - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py + - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray VLLM_USE_RAY_SPMD_WORKER=1 VLLM_USE_RAY_COMPILED_DAG=1 pytest -v -s distributed/test_basic_distributed_correctness.py - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py + - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py + +- label: Pipeline Parallelism Test + working_dir: "/vllm-workspace/tests" + num_gpus: 4 + commands: + - pytest -v -s distributed/test_pipeline_parallel.py - label: Engine Test mirror_hardwares: [amd] - command: pytest -v -s engine tokenization test_sequence.py test_config.py test_logger.py + commands: + - pytest -v -s engine test_sequence.py test_config.py test_logger.py + # OOM in the CI unless we run this separately + - pytest -v -s tokenization - label: Entrypoints Test + fast_check: true mirror_hardwares: [amd] commands: - - pytest -v -s entrypoints -m llm - - pytest -v -s entrypoints -m openai + - pytest -v -s entrypoints/llm + - pytest -v -s entrypoints/openai - label: Examples Test working_dir: "/vllm-workspace/examples" @@ -88,6 +145,7 @@ steps: # install tensorizer for tensorize_vllm_model.py - pip install awscli tensorizer - python3 offline_inference.py + - python3 cpu_offload.py - python3 offline_inference_with_prefix.py - python3 llm_engine_example.py - python3 llava_example.py @@ -96,24 +154,25 @@ steps: - label: Inputs Test #mirror_hardwares: [amd] commands: - - bash ../.buildkite/download-images.sh - pytest -v -s test_inputs.py - pytest -v -s multimodal - label: Kernels Test %N #mirror_hardwares: [amd] - command: pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + commands: + - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.8/flashinfer-0.0.8+cu121torch2.3-cp310-cp310-linux_x86_64.whl + - pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT parallelism: 4 - label: Models Test #mirror_hardwares: [amd] commands: + - pip install 
https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.8/flashinfer-0.0.8+cu121torch2.3-cp310-cp310-linux_x86_64.whl - pytest -v -s models -m \"not vlm\" - label: Vision Language Models Test mirror_hardwares: [amd] commands: - - bash ../.buildkite/download-images.sh - pytest -v -s models -m vlm - label: Prefix Caching Test @@ -130,7 +189,9 @@ steps: command: pytest -v -s test_logits_processor.py - label: Utils Test - command: pytest -v -s test_utils.py + commands: + - pytest -v -s test_utils.py + - pytest -v -s test_embedded_commit.py - label: Worker Test mirror_hardwares: [amd] @@ -160,7 +221,10 @@ steps: - label: Tensorizer Test #mirror_hardwares: [amd] - command: apt-get install curl libsodium23 && pytest -v -s tensorizer_loader + commands: + - apt-get install -y curl libsodium23 + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s tensorizer_loader - label: Metrics Test mirror_hardwares: [amd] @@ -186,8 +250,25 @@ steps: - pip install aiohttp - bash run-benchmarks.sh +- label: LM Eval Small Models + working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" + commands: + - pip install lm-eval + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - bash ./run-tests.sh -c configs/models-small.txt -t 1 + +- label: LM Eval Large Models + gpu: a100 + num_gpus: 4 + working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" + commands: + - pip install lm-eval + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - bash ./run-tests.sh -c configs/models-large.txt -t 4 + - label: Documentation Build working_dir: "/vllm-workspace/test_docs/docs" + fast_check: true no_gpu: True commands: - pip install -r requirements-docs.txt @@ -202,3 +283,7 @@ steps: - pytest -v -s distributed/test_custom_all_reduce.py - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py + - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.8/flashinfer-0.0.8+cu121torch2.3-cp310-cp310-linux_x86_64.whl + - VLLM_ATTENTION_BACKEND=FLASHINFER TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py + - VLLM_ATTENTION_BACKEND=FLASHINFER TEST_DIST_MODEL=meta-llama/Meta-Llama-3-8B DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py + - pytest -v -s -x lora/test_mixtral.py diff --git a/.buildkite/test-template-aws.j2 b/.buildkite/test-template-aws.j2 deleted file mode 100644 index fb34b787e0cbd..0000000000000 --- a/.buildkite/test-template-aws.j2 +++ /dev/null @@ -1,139 +0,0 @@ -{% set docker_image = "public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT" %} -{% set default_working_dir = "/vllm-workspace/tests" %} - -steps: - - label: ":docker: build image" - agents: - queue: cpu_queue - commands: - - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - - "docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --tag {{ docker_image }} --target test --progress plain ." 
- - "docker push {{ docker_image }}" - env: - DOCKER_BUILDKIT: "1" - retry: - automatic: - - exit_status: -1 # Agent was lost - limit: 5 - - exit_status: -10 # Agent was lost - limit: 5 - - wait - - - group: "AMD Tests" - depends_on: ~ - steps: - {% for step in steps %} - {% if step.mirror_hardwares and "amd" in step.mirror_hardwares %} - - label: "AMD: {{ step.label }}" - agents: - queue: amd - command: bash .buildkite/run-amd-test.sh "cd {{ (step.working_dir or default_working_dir) | safe }} ; {{ step.command or (step.commands | join(" ; ")) | safe }}" - env: - DOCKER_BUILDKIT: "1" - priority: 100 - soft_fail: true - {% endif %} - {% endfor %} - - - label: "Neuron Test" - depends_on: ~ - agents: - queue: neuron - command: bash .buildkite/run-neuron-test.sh - soft_fail: false - - - label: "Intel Test" - depends_on: ~ - agents: - queue: intel - command: bash .buildkite/run-cpu-test.sh - - {% for step in steps %} - {% if step.gpu == "a100" %} - - label: "{{ step.label }}" - agents: - queue: a100-queue - soft_fail: {{ step.soft_fail or false }} - {% if step.parallelism %} - parallelism: {{ step.parallelism }} - {% endif %} - retry: - automatic: - - exit_status: -1 # Agent was lost - limit: 5 - - exit_status: -10 # Agent was lost - limit: 5 - plugins: - - kubernetes: - podSpec: - priorityClassName: ci - containers: - - image: {{ docker_image }} - command: ["bash"] - args: - - '-c' - - "'cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(' && ')) | safe }}'" - resources: - limits: - nvidia.com/gpu: {{ step.num_gpus or 1 }} - volumeMounts: - - name: devshm - mountPath: /dev/shm - env: - - name: VLLM_USAGE_SOURCE - value: ci-test - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB - volumes: - - name: devshm - emptyDir: - medium: Memory - {% else %} - - label: "{{ step.label }}" - agents: - {% if step.label == "Documentation Build" %} - queue: small_cpu_queue - {% elif step.no_gpu %} - queue: cpu_queue - {% elif step.num_gpus == 2 or step.num_gpus == 4 %} - queue: gpu_4_queue - {% else %} - queue: gpu_1_queue - {% endif %} - soft_fail: {{ step.soft_fail or false }} - {% if step.parallelism %} - parallelism: {{ step.parallelism }} - {% endif %} - retry: - automatic: - - exit_status: -1 # Agent was lost - limit: 5 - - exit_status: -10 # Agent was lost - limit: 5 - plugins: - - docker#v5.2.0: - image: {{ docker_image }} - always-pull: true - propagate-environment: true - {% if not step.no_gpu %} - gpus: all - {% endif %} - {% if step.label == "Benchmarks" %} - mount-buildkite-agent: true - {% endif %} - command: ["bash", "-c", "cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(' && ')) | safe }}"] - environment: - - VLLM_USAGE_SOURCE=ci-test - - HF_TOKEN - {% if step.label == "Speculative decoding tests" %} - - VLLM_ATTENTION_BACKEND=XFORMERS - {% endif %} - volumes: - - /dev/shm:/dev/shm - {% endif %} - {% endfor %} diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml new file mode 100644 index 0000000000000..71f4e520135d4 --- /dev/null +++ b/.github/FUNDING.yml @@ -0,0 +1,2 @@ +github: [vllm-project] +open_collective: [vllm] diff --git a/.github/workflows/mypy.yaml b/.github/workflows/mypy.yaml index 62f0dbcd93eff..5780f09a646cb 100644 --- a/.github/workflows/mypy.yaml +++ b/.github/workflows/mypy.yaml @@ -32,20 +32,22 @@ jobs: pip install types-setuptools - name: Mypy run: | + mypy tests 
--config-file pyproject.toml + mypy vllm/*.py --config-file pyproject.toml mypy vllm/attention --config-file pyproject.toml mypy vllm/core --config-file pyproject.toml mypy vllm/distributed --config-file pyproject.toml + mypy vllm/engine --config-file pyproject.toml mypy vllm/entrypoints --config-file pyproject.toml mypy vllm/executor --config-file pyproject.toml + mypy vllm/inputs --config-file pyproject.toml + mypy vllm/logging --config-file pyproject.toml + mypy vllm/lora --config-file pyproject.toml + mypy vllm/model_executor --config-file pyproject.toml mypy vllm/multimodal --config-file pyproject.toml - mypy vllm/usage --config-file pyproject.toml - mypy vllm/*.py --config-file pyproject.toml + mypy vllm/platforms --config-file pyproject.toml + mypy vllm/spec_decode --config-file pyproject.toml mypy vllm/transformers_utils --config-file pyproject.toml - mypy vllm/engine --config-file pyproject.toml + mypy vllm/usage --config-file pyproject.toml mypy vllm/worker --config-file pyproject.toml - mypy vllm/spec_decode --config-file pyproject.toml - mypy vllm/model_executor --config-file pyproject.toml - mypy vllm/lora --config-file pyproject.toml - mypy vllm/logging --config-file pyproject.toml - mypy tests --config-file pyproject.toml diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 9c35ede5f6781..15c2ec05b25db 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -49,7 +49,7 @@ jobs: matrix: os: ['ubuntu-20.04'] python-version: ['3.8', '3.9', '3.10', '3.11'] - pytorch-version: ['2.3.0'] # Must be the most recent version that meets requirements-cuda.txt. + pytorch-version: ['2.3.1'] # Must be the most recent version that meets requirements-cuda.txt. cuda-version: ['11.8', '12.1'] steps: diff --git a/.gitignore b/.gitignore index e077366d1e4a1..17184b19127ca 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,6 @@ +# vllm commit id, generated by setup.py +vllm/commit_id.py + # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] diff --git a/CMakeLists.txt b/CMakeLists.txt index aa15b632cdd3b..bf00a36edc500 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,7 +2,8 @@ cmake_minimum_required(VERSION 3.21) project(vllm_extensions LANGUAGES CXX) -option(VLLM_TARGET_DEVICE "Target device backend for vLLM" "cuda") +# CUDA by default, can be overridden by using -DVLLM_TARGET_DEVICE=... (used by setup.py) +set(VLLM_TARGET_DEVICE "cuda" CACHE STRING "Target device backend for vLLM") message(STATUS "Build type: ${CMAKE_BUILD_TYPE}") message(STATUS "Target device: ${VLLM_TARGET_DEVICE}") @@ -31,9 +32,8 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx11 # requirements.txt files and should be kept consistent. The ROCm torch # versions are derived from Dockerfile.rocm # -set(TORCH_SUPPORTED_VERSION_CUDA "2.3.0") -set(TORCH_SUPPORTED_VERSION_ROCM_5X "2.0.1") -set(TORCH_SUPPORTED_VERSION_ROCM_6X "2.1.1") +set(TORCH_SUPPORTED_VERSION_CUDA "2.3.1") +set(TORCH_SUPPORTED_VERSION_ROCM "2.5.0") # # Try to find python package with an executable that exactly matches @@ -98,18 +98,11 @@ elseif(HIP_FOUND) # .hip extension automatically, HIP must be enabled explicitly. 
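The CMakeLists.txt change above turns the target device into a cache variable so the build can be pointed at a non-CUDA backend. In this diff that selection is driven through the VLLM_TARGET_DEVICE environment variable, which setup.py is expected to translate into -DVLLM_TARGET_DEVICE; a sketch (the cpu and openvino values mirror the new Dockerfile.ppc64le and Dockerfile.openvino below):

# Illustrative only: select a non-CUDA backend at build time.
VLLM_TARGET_DEVICE=cpu python3 setup.py install
VLLM_TARGET_DEVICE=openvino python3 -m pip install .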
enable_language(HIP) - # ROCm 5.x - if (ROCM_VERSION_DEV_MAJOR EQUAL 5 AND - NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM_5X}) - message(WARNING "Pytorch version ${TORCH_SUPPORTED_VERSION_ROCM_5X} " - "expected for ROCMm 5.x build, saw ${Torch_VERSION} instead.") - endif() - - # ROCm 6.x - if (ROCM_VERSION_DEV_MAJOR EQUAL 6 AND - NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM_6X}) - message(WARNING "Pytorch version ${TORCH_SUPPORTED_VERSION_ROCM_6X} " - "expected for ROCMm 6.x build, saw ${Torch_VERSION} instead.") + # ROCm 5.X and 6.X + if (ROCM_VERSION_DEV_MAJOR GREATER_EQUAL 5 AND + NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM}) + message(WARNING "Pytorch version >= ${TORCH_SUPPORTED_VERSION_ROCM} " + "expected for ROCm build, saw ${Torch_VERSION} instead.") endif() else() message(FATAL_ERROR "Can't find CUDA or HIP installation.") @@ -158,6 +151,7 @@ set(VLLM_EXT_SRC "csrc/quantization/fp8/common.cu" "csrc/cuda_utils_kernels.cu" "csrc/moe_align_block_size_kernels.cu" + "csrc/prepare_inputs/advance_step.cu" "csrc/torch_bindings.cpp") if(VLLM_GPU_LANG STREQUAL "CUDA") @@ -178,6 +172,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") "csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu" "csrc/quantization/gptq_marlin/gptq_marlin.cu" "csrc/quantization/gptq_marlin/gptq_marlin_repack.cu" + "csrc/quantization/gptq_marlin/awq_marlin_repack.cu" + "csrc/quantization/fp8/fp8_marlin.cu" "csrc/custom_all_reduce.cu" "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu" "csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu" diff --git a/Dockerfile b/Dockerfile index 5b3e682a80169..b9a56e67e8d7b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -8,10 +8,10 @@ ARG CUDA_VERSION=12.4.1 #################### BASE BUILD IMAGE #################### # prepare basic build environment -FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS base +FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 AS base ARG CUDA_VERSION=12.4.1 -ARG PYTHON_VERSION=3 +ARG PYTHON_VERSION=3.10 ENV DEBIAN_FRONTEND=noninteractive @@ -21,13 +21,16 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ && apt-get install -y ccache software-properties-common \ && add-apt-repository ppa:deadsnakes/ppa \ && apt-get update -y \ - && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv python3-pip \ + && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \ && if [ "${PYTHON_VERSION}" != "3" ]; then update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1; fi \ - && python3 --version \ - && python3 -m pip --version + && python3 --version RUN apt-get update -y \ - && apt-get install -y python3-pip git curl sudo + && apt-get install -y git curl sudo + +# Install pip s.t. 
it will be compatible with our PYTHON_VERSION +RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} +RUN python3 -m pip --version # Workaround for https://github.com/openai/triton/issues/2507 and # https://github.com/pytorch/pytorch/issues/107960 -- hopefully @@ -43,6 +46,10 @@ COPY requirements-cuda.txt requirements-cuda.txt RUN --mount=type=cache,target=/root/.cache/pip \ python3 -m pip install -r requirements-cuda.txt +COPY requirements-mamba.txt requirements-mamba.txt +RUN python3 -m pip install packaging +RUN python3 -m pip install -r requirements-mamba.txt + # cuda arch list used by torch # can be useful for both `dev` and `test` # explicitly set the list to avoid issues with torch 2.2 @@ -54,7 +61,7 @@ ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list} #################### WHEEL BUILD IMAGE #################### FROM base AS build -ARG PYTHON_VERSION=3 +ARG PYTHON_VERSION=3.10 # install build dependencies COPY requirements-build.txt requirements-build.txt @@ -84,6 +91,9 @@ ENV NVCC_THREADS=$nvcc_threads # make sure punica kernels are built (for LoRA) ENV VLLM_INSTALL_PUNICA_KERNELS=1 +ARG buildkite_commit +ENV BUILDKITE_COMMIT=${buildkite_commit} + ARG USE_SCCACHE # if USE_SCCACHE is set, use sccache to speed up compilation RUN --mount=type=cache,target=/root/.cache/pip \ @@ -93,10 +103,15 @@ RUN --mount=type=cache,target=/root/.cache/pip \ && tar -xzf sccache.tar.gz \ && sudo mv sccache-v0.8.1-x86_64-unknown-linux-musl/sccache /usr/bin/sccache \ && rm -rf sccache.tar.gz sccache-v0.8.1-x86_64-unknown-linux-musl \ - && export SCCACHE_BUCKET=vllm-build-sccache \ + && if [ "$CUDA_VERSION" = "11.8.0" ]; then \ + export SCCACHE_BUCKET=vllm-build-sccache-2; \ + else \ + export SCCACHE_BUCKET=vllm-build-sccache; \ + fi \ && export SCCACHE_REGION=us-west-2 \ + && export CMAKE_BUILD_TYPE=Release \ && sccache --show-stats \ - && python3 setup.py bdist_wheel --dist-dir=dist \ + && python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38 \ && sccache --show-stats; \ fi @@ -104,7 +119,7 @@ ENV CCACHE_DIR=/root/.cache/ccache RUN --mount=type=cache,target=/root/.cache/ccache \ --mount=type=cache,target=/root/.cache/pip \ if [ "$USE_SCCACHE" != "1" ]; then \ - python3 setup.py bdist_wheel --dist-dir=dist; \ + python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \ fi # check the size of the wheel, we cannot upload wheels larger than 100MB @@ -123,15 +138,45 @@ RUN --mount=type=cache,target=/root/.cache/pip \ python3 -m pip install -r requirements-dev.txt #################### DEV IMAGE #################### +#################### MAMBA Build IMAGE #################### +FROM dev as mamba-builder +# max jobs used for build +ARG max_jobs=2 +ENV MAX_JOBS=${max_jobs} + +WORKDIR /usr/src/mamba + +COPY requirements-mamba.txt requirements-mamba.txt + +# Download the wheel or build it if a pre-compiled release doesn't exist +RUN pip --verbose wheel -r requirements-mamba.txt \ + --no-build-isolation --no-deps --no-cache-dir + +#################### MAMBA Build IMAGE #################### #################### vLLM installation IMAGE #################### # image with vLLM installed -FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu22.04 AS vllm-base +FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu20.04 AS vllm-base ARG CUDA_VERSION=12.4.1 +ARG PYTHON_VERSION=3.10 WORKDIR /vllm-workspace +RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ + && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \ + && apt-get update -y \ + && 
apt-get install -y ccache software-properties-common \ + && add-apt-repository ppa:deadsnakes/ppa \ + && apt-get update -y \ + && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \ + && if [ "${PYTHON_VERSION}" != "3" ]; then update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1; fi \ + && python3 --version + RUN apt-get update -y \ - && apt-get install -y python3-pip git vim + && apt-get install -y python3-pip git vim curl libibverbs-dev + +# Install pip s.t. it will be compatible with our PYTHON_VERSION +RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} +RUN python3 -m pip --version # Workaround for https://github.com/openai/triton/issues/2507 and # https://github.com/pytorch/pytorch/issues/107960 -- hopefully @@ -143,6 +188,13 @@ RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \ --mount=type=cache,target=/root/.cache/pip \ python3 -m pip install dist/*.whl --verbose + +RUN --mount=type=bind,from=mamba-builder,src=/usr/src/mamba,target=/usr/src/mamba \ + --mount=type=cache,target=/root/.cache/pip \ + python3 -m pip install /usr/src/mamba/*.whl --no-cache-dir + +RUN --mount=type=cache,target=/root/.cache/pip \ + python3 -m pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.9/flashinfer-0.0.9+cu121torch2.3-cp310-cp310-linux_x86_64.whl #################### vLLM installation IMAGE #################### @@ -172,7 +224,7 @@ FROM vllm-base AS vllm-openai # install additional dependencies for openai api server RUN --mount=type=cache,target=/root/.cache/pip \ - pip install accelerate hf_transfer modelscope + pip install accelerate hf_transfer 'modelscope!=1.15.0' ENV VLLM_USAGE_SOURCE production-docker-image diff --git a/Dockerfile.cpu b/Dockerfile.cpu index 6e55203decc56..f95d748f1e4be 100644 --- a/Dockerfile.cpu +++ b/Dockerfile.cpu @@ -6,7 +6,13 @@ RUN apt-get update -y \ && apt-get install -y git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 \ && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 -RUN echo 'export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD' >> ~/.bashrc +# https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/performance_tuning/tuning_guide.html +# intel-openmp provides additional performance improvement vs. openmp +# tcmalloc provides better memory allocation efficiency, e.g, holding memory in caches to speed up access of commonly-used objects. +RUN pip install intel-openmp + +ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/lib/libiomp5.so:$LD_PRELOAD" + RUN pip install https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/cpu/intel_extension_for_pytorch-2.3.100%2Bgit0eb3473-cp310-cp310-linux_x86_64.whl @@ -31,4 +37,4 @@ WORKDIR /workspace/ RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks -CMD ["/bin/bash"] +ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"] diff --git a/Dockerfile.openvino b/Dockerfile.openvino new file mode 100644 index 0000000000000..cfb786485266c --- /dev/null +++ b/Dockerfile.openvino @@ -0,0 +1,26 @@ +# The vLLM Dockerfile is used to construct vLLM image that can be directly used +# to run the OpenAI compatible server. 
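The OpenVINO image introduced here is exercised by the new .buildkite/run-openvino-test.sh earlier in this diff; roughly, that flow is:

# Illustrative only: build the OpenVINO image and run the offline-inference sanity check,
# mirroring .buildkite/run-openvino-test.sh.
docker build -t openvino-test -f Dockerfile.openvino .
docker run --network host \
    --env VLLM_OPENVINO_KVCACHE_SPACE=1 \
    --name openvino-test openvino-test \
    python3 /workspace/vllm/examples/offline_inference.py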
+ +FROM ubuntu:20.04 AS dev + +RUN apt-get update -y && \ + apt-get install -y python3-pip git +WORKDIR /workspace + +# copy requirements +COPY requirements-build.txt /workspace/vllm/ +COPY requirements-common.txt /workspace/vllm/ +COPY requirements-openvino.txt /workspace/vllm/ + +COPY vllm/ /workspace/vllm/vllm +COPY setup.py /workspace/vllm/ + +# install build requirements +RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/vllm/requirements-build.txt +# build vLLM with OpenVINO backend +RUN PIP_PRE=1 PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu https://storage.openvinotoolkit.org/simple/wheels/nightly/" VLLM_TARGET_DEVICE="openvino" python3 -m pip install /workspace/vllm/ + +COPY examples/ /workspace/vllm/examples +COPY benchmarks/ /workspace/vllm/benchmarks + +CMD ["/bin/bash"] diff --git a/Dockerfile.ppc64le b/Dockerfile.ppc64le new file mode 100644 index 0000000000000..d4e4c483cada8 --- /dev/null +++ b/Dockerfile.ppc64le @@ -0,0 +1,22 @@ +FROM mambaorg/micromamba +ARG MAMBA_DOCKERFILE_ACTIVATE=1 +USER root + +RUN apt-get update -y && apt-get install -y git wget vim numactl gcc-12 g++-12 protobuf-compiler libprotobuf-dev && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 + +# Some packages in requirements-cpu are installed here +# IBM provides optimized packages for ppc64le processors in the open-ce project for mamba +# Currently these may not be available for venv or pip directly +RUN micromamba install -y -n base -c https://ftp.osuosl.org/pub/open-ce/1.11.0-p10/ -c defaults python=3.10 pytorch-cpu=2.1.2 torchvision-cpu=0.16.2 && micromamba clean --all --yes + +COPY ./ /workspace/vllm + +WORKDIR /workspace/vllm + +# These packages will be in rocketce eventually +RUN pip install -v -r requirements-cpu.txt --prefer-binary --extra-index-url https://repo.fury.io/mgiessing + +RUN VLLM_TARGET_DEVICE=cpu python3 setup.py install + +WORKDIR /vllm-workspace +ENTRYPOINT ["/opt/conda/bin/python3", "-m", "vllm.entrypoints.openai.api_server"] diff --git a/Dockerfile.rocm b/Dockerfile.rocm index 6bda696859c8b..ff39791456398 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -1,34 +1,33 @@ -# default base image -ARG BASE_IMAGE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" +# Default ROCm 6.1 base image +ARG BASE_IMAGE="rocm/pytorch:rocm6.1.2_ubuntu20.04_py3.9_pytorch_staging" -FROM $BASE_IMAGE - -ARG BASE_IMAGE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" - -RUN echo "Base image is $BASE_IMAGE" - -ARG ROCm_5_7_BASE="rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1" \ - ROCm_6_0_BASE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" +# Default ROCm ARCHes to build vLLM for. +ARG PYTORCH_ROCM_ARCH="gfx908;gfx90a;gfx942;gfx1100" +# Whether to install CK-based flash-attention +# If 0, will not install flash-attention +ARG BUILD_FA="1" +# If `TRY_FA_WHEEL=1`, we will try installing flash-attention from `FA_WHEEL_URL` +# If this succeeds, we use the downloaded wheel and skip building flash-attention. 
+# Otherwise, ROCm flash-attention from `FA_BRANCH` will be built for the +# architectures specified in `FA_GFX_ARCHS` +ARG TRY_FA_WHEEL="1" +ARG FA_WHEEL_URL="https://github.com/ROCm/flash-attention/releases/download/v2.5.9post1-cktile-vllm/flash_attn-2.5.9.post1-cp39-cp39-linux_x86_64.whl" ARG FA_GFX_ARCHS="gfx90a;gfx942" -RUN echo "FA_GFX_ARCHS is $FA_GFX_ARCHS" +ARG FA_BRANCH="23a2b1c2" -ARG FA_BRANCH="ae7928c" -RUN echo "FA_BRANCH is $FA_BRANCH" +# Whether to build triton on rocm +ARG BUILD_TRITON="1" +ARG TRITON_BRANCH="e0fc12c" -# whether to build flash-attention -# if 0, will not build flash attention -# this is useful for gfx target where flash-attention is not supported -# In that case, we need to use the python reference attention implementation in vllm -ARG BUILD_FA="1" +### Base image build stage +FROM $BASE_IMAGE AS base -# whether to build triton on rocm -ARG BUILD_TRITON="1" +# Import arg(s) defined before this build stage +ARG PYTORCH_ROCM_ARCH # Install some basic utilities RUN apt-get update && apt-get install python3 python3-pip -y - -# Install some basic utilities RUN apt-get update && apt-get install -y \ curl \ ca-certificates \ @@ -39,79 +38,152 @@ RUN apt-get update && apt-get install -y \ build-essential \ wget \ unzip \ - nvidia-cuda-toolkit \ tmux \ ccache \ && rm -rf /var/lib/apt/lists/* -### Mount Point ### -# When launching the container, mount the code directory to /app +# When launching the container, mount the code directory to /vllm-workspace ARG APP_MOUNT=/vllm-workspace -VOLUME [ ${APP_MOUNT} ] WORKDIR ${APP_MOUNT} RUN python3 -m pip install --upgrade pip -RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas +# Remove sccache so it doesn't interfere with ccache +# TODO: implement sccache support across components +RUN apt-get purge -y sccache; python3 -m pip uninstall -y sccache; rm -f "$(which sccache)" +# Install torch == 2.5.0 on ROCm +RUN case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \ + *"rocm-6.1"*) \ + python3 -m pip uninstall -y torch torchaudio torchvision \ + && python3 -m pip install --no-cache-dir --pre \ + torch==2.5.0.dev20240710 torchaudio==2.4.0.dev20240710 \ + torchvision==0.20.0.dev20240710 \ + --index-url https://download.pytorch.org/whl/nightly/rocm6.1;; \ + *) ;; esac ENV LLVM_SYMBOLIZER_PATH=/opt/rocm/llvm/bin/llvm-symbolizer ENV PATH=$PATH:/opt/rocm/bin:/libtorch/bin: ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib/:/libtorch/lib: ENV CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/libtorch/include:/libtorch/include/torch/csrc/api/include/:/opt/rocm/include/: -# Install ROCm flash-attention -RUN if [ "$BUILD_FA" = "1" ]; then \ - mkdir libs \ - && cd libs \ - && git clone https://github.com/ROCm/flash-attention.git \ - && cd flash-attention \ - && git checkout ${FA_BRANCH} \ - && git submodule update --init \ - && export GPU_ARCHS=${FA_GFX_ARCHS} \ - && if [ "$BASE_IMAGE" = "$ROCm_5_7_BASE" ]; then \ - patch /opt/conda/envs/py_3.10/lib/python3.10/site-packages/torch/utils/hipify/hipify_python.py hipify_patch.patch; fi \ - && python3 setup.py install \ - && cd ..; \ +ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} +ENV CCACHE_DIR=/root/.cache/ccache + + +### AMD-SMI build stage +FROM base AS build_amdsmi +# Build amdsmi wheel always +RUN cd /opt/rocm/share/amd_smi \ + && python3 -m pip wheel . 
--wheel-dir=/install + + +### Flash-Attention wheel build stage +FROM base AS build_fa +ARG BUILD_FA +ARG TRY_FA_WHEEL +ARG FA_WHEEL_URL +ARG FA_GFX_ARCHS +ARG FA_BRANCH +# Build ROCm flash-attention wheel if `BUILD_FA = 1` +RUN --mount=type=cache,target=${CCACHE_DIR} \ + if [ "$BUILD_FA" = "1" ]; then \ + if [ "${TRY_FA_WHEEL}" = "1" ] && python3 -m pip install "${FA_WHEEL_URL}"; then \ + # If a suitable wheel exists, we download it instead of building FA + mkdir -p /install && wget -N "${FA_WHEEL_URL}" -P /install; \ + else \ + mkdir -p libs \ + && cd libs \ + && git clone https://github.com/ROCm/flash-attention.git \ + && cd flash-attention \ + && git checkout "${FA_BRANCH}" \ + && git submodule update --init \ + && GPU_ARCHS="${FA_GFX_ARCHS}" python3 setup.py bdist_wheel --dist-dir=/install; \ + fi; \ + # Create an empty directory otherwise as later build stages expect one + else mkdir -p /install; \ fi -# Error related to odd state for numpy 1.20.3 where there is no METADATA etc, but an extra LICENSES_bundled.txt. -# Manually removed it so that later steps of numpy upgrade can continue -RUN if [ "$BASE_IMAGE" = "$ROCm_6_0_BASE" ]; then \ - rm -rf /opt/conda/envs/py_3.9/lib/python3.9/site-packages/numpy-1.20.3.dist-info/; fi -# build triton -RUN if [ "$BUILD_TRITON" = "1" ]; then \ +### Triton wheel build stage +FROM base AS build_triton +ARG BUILD_TRITON +ARG TRITON_BRANCH +# Build triton wheel if `BUILD_TRITON = 1` +RUN --mount=type=cache,target=${CCACHE_DIR} \ + if [ "$BUILD_TRITON" = "1" ]; then \ mkdir -p libs \ && cd libs \ - && pip uninstall -y triton \ - && git clone https://github.com/ROCm/triton.git \ - && cd triton/python \ - && pip3 install . \ - && cd ../..; \ + && git clone https://github.com/OpenAI/triton.git \ + && cd triton \ + && git checkout "${TRITON_BRANCH}" \ + && cd python \ + && python3 setup.py bdist_wheel --dist-dir=/install; \ + # Create an empty directory otherwise as later build stages expect one + else mkdir -p /install; \ fi -WORKDIR /vllm-workspace + +### Final vLLM build stage +FROM base AS final +# Import the vLLM development directory from the build context COPY . . -#RUN python3 -m pip install pynvml # to be removed eventually -RUN python3 -m pip install --upgrade pip numba +# Error related to odd state for numpy 1.20.3 where there is no METADATA etc, but an extra LICENSES_bundled.txt. 
+# Manually remove it so that later steps of numpy upgrade can continue +RUN case "$(which python3)" in \ + *"/opt/conda/envs/py_3.9"*) \ + rm -rf /opt/conda/envs/py_3.9/lib/python3.9/site-packages/numpy-1.20.3.dist-info/;; \ + *) ;; esac + +# Package upgrades for useful functionality or to avoid dependency issues +RUN --mount=type=cache,target=/root/.cache/pip \ + python3 -m pip install --upgrade numba scipy huggingface-hub[cli] -# make sure punica kernels are built (for LoRA) +# Make sure punica kernels are built (for LoRA) ENV VLLM_INSTALL_PUNICA_KERNELS=1 # Workaround for ray >= 2.10.0 ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1 +# Silences the HF Tokenizers warning +ENV TOKENIZERS_PARALLELISM=false -ENV VLLM_NCCL_SO_PATH=/opt/rocm/lib/librccl.so - -ENV CCACHE_DIR=/root/.cache/ccache -RUN --mount=type=cache,target=/root/.cache/ccache \ +RUN --mount=type=cache,target=${CCACHE_DIR} \ --mount=type=cache,target=/root/.cache/pip \ - pip install -U -r requirements-rocm.txt \ - && if [ "$BASE_IMAGE" = "$ROCm_6_0_BASE" ]; then \ - patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h ./rocm_patch/rocm_bf16.patch; fi \ - && python3 setup.py install \ - && export VLLM_PYTHON_VERSION=$(python -c "import sys; print(str(sys.version_info.major) + str(sys.version_info.minor))") \ - && cp build/lib.linux-x86_64-cpython-${VLLM_PYTHON_VERSION}/vllm/*.so vllm/ \ - && cd .. + python3 -m pip install -Ur requirements-rocm.txt \ + && case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \ + *"rocm-6.1"*) \ + # Bring in upgrades to HIP graph earlier than ROCm 6.2 for vLLM + wget -N https://github.com/ROCm/vllm/raw/fa78403/rocm_patch/libamdhip64.so.6 -P /opt/rocm/lib \ + # Prevent interference if torch bundles its own HIP runtime + && rm -f "$(python3 -c 'import torch; print(torch.__path__[0])')"/lib/libamdhip64.so* || true;; \ + *) ;; esac \ + && python3 setup.py clean --all \ + && python3 setup.py develop + +# Copy amdsmi wheel into final image +RUN --mount=type=bind,from=build_amdsmi,src=/install,target=/install \ + mkdir -p libs \ + && cp /install/*.whl libs \ + # Preemptively uninstall to avoid same-version no-installs + && python3 -m pip uninstall -y amdsmi; +# Copy triton wheel(s) into final image if they were built +RUN --mount=type=bind,from=build_triton,src=/install,target=/install \ + mkdir -p libs \ + && if ls /install/*.whl; then \ + cp /install/*.whl libs \ + # Preemptively uninstall to avoid same-version no-installs + && python3 -m pip uninstall -y triton; fi + +# Copy flash-attn wheel(s) into final image if they were built +RUN --mount=type=bind,from=build_fa,src=/install,target=/install \ + mkdir -p libs \ + && if ls /install/*.whl; then \ + cp /install/*.whl libs \ + # Preemptively uninstall to avoid same-version no-installs + && python3 -m pip uninstall -y flash-attn; fi + +# Install wheels that were built to the final image +RUN --mount=type=cache,target=/root/.cache/pip \ + if ls libs/*.whl; then \ + python3 -m pip install libs/*.whl; fi CMD ["/bin/bash"] diff --git a/Dockerfile.tpu b/Dockerfile.tpu index 931c844c08dce..be7dbe63cb237 100644 --- a/Dockerfile.tpu +++ b/Dockerfile.tpu @@ -1,19 +1,20 @@ -ARG NIGHTLY_DATE="20240601" +ARG NIGHTLY_DATE="20240713" ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.10_tpuvm_$NIGHTLY_DATE" FROM $BASE_IMAGE - WORKDIR /workspace -COPY . /workspace/vllm -ENV VLLM_TARGET_DEVICE="tpu" # Install aiohttp separately to avoid build errors. RUN pip install aiohttp +# Install NumPy 1 instead of NumPy 2. 
+RUN pip install "numpy<2" # Install the TPU and Pallas dependencies. RUN pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html RUN pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html # Build vLLM. +COPY . /workspace/vllm +ENV VLLM_TARGET_DEVICE="tpu" RUN cd /workspace/vllm && python setup.py develop CMD ["/bin/bash"] diff --git a/Dockerfile.ubi b/Dockerfile.ubi index 822363161be2b..4462ce8a59c21 100644 --- a/Dockerfile.ubi +++ b/Dockerfile.ubi @@ -120,6 +120,7 @@ COPY vllm vllm ENV CCACHE_DIR=/root/.cache/ccache RUN --mount=type=cache,target=/root/.cache/ccache \ --mount=type=cache,target=/root/.cache/pip \ + --mount=type=bind,src=.git,target=/workspace/.git \ env CFLAGS="-march=haswell" \ CXXFLAGS="$CFLAGS $CXXFLAGS" \ CMAKE_BUILD_TYPE=Release \ @@ -153,9 +154,6 @@ ENV PATH=$VIRTUAL_ENV/bin/:$PATH RUN microdnf install -y gcc \ && microdnf clean all -# Custom cache manager (fix for https://issues.redhat.com/browse/RHOAIENG-8043) -COPY extras/custom_cache_manager.py /opt/vllm/lib/python3.11/site-packages/custom_cache_manager.py - # install vllm wheel first, so that torch etc will be installed RUN --mount=type=bind,from=build,src=/workspace/dist,target=/workspace/dist \ --mount=type=cache,target=/root/.cache/pip \ @@ -166,12 +164,14 @@ RUN --mount=type=bind,from=libsodium-builder,src=/usr/src/libsodium,target=/usr/ cd /usr/src/libsodium \ && make install +RUN --mount=type=cache,target=/root/.cache/pip \ + pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.9/flashinfer-0.0.9+cu121torch2.3-cp311-cp311-linux_x86_64.whl + ENV HF_HUB_OFFLINE=1 \ PORT=8000 \ HOME=/home/vllm \ VLLM_USAGE_SOURCE=production-docker-image \ - VLLM_WORKER_MULTIPROC_METHOD=fork \ - TRITON_CACHE_MANAGER="custom_cache_manager:CustomCacheManager" + VLLM_WORKER_MULTIPROC_METHOD=fork # setup non-root user for OpenShift RUN umask 002 \ @@ -181,7 +181,7 @@ RUN umask 002 \ COPY LICENSE /licenses/vllm.md USER 2000 -ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server", "--distributed-executor-backend=mp"] +ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"] FROM vllm-openai as vllm-grpc-adapter @@ -189,8 +189,8 @@ FROM vllm-openai as vllm-grpc-adapter USER root RUN --mount=type=cache,target=/root/.cache/pip \ - pip install vllm-tgis-adapter==0.1.3 + pip install vllm-tgis-adapter==0.2.3 ENV GRPC_PORT=8033 USER 2000 -ENTRYPOINT ["python3", "-m", "vllm_tgis_adapter", "--distributed-executor-backend=mp"] +ENTRYPOINT ["python3", "-m", "vllm_tgis_adapter"] diff --git a/Dockerfile.xpu b/Dockerfile.xpu index c39e551672d20..f91baa11a3753 100644 --- a/Dockerfile.xpu +++ b/Dockerfile.xpu @@ -1,4 +1,4 @@ -FROM intel/oneapi-basekit:2024.1.0-devel-ubuntu22.04 +FROM intel/oneapi-basekit:2024.1.0-devel-ubuntu20.04 RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \ echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \ diff --git a/OWNERS b/OWNERS index dc965385e1863..09b25dab41c00 100644 --- a/OWNERS +++ b/OWNERS @@ -1,17 +1,27 @@ approvers: - dtrifiro + - fialhocoelho - heyselbi - - rpancham + - joerunde + - maxdebayser + - njhill + - prashantgupta24 - RH-steve-grubb + - rpancham - 
terrytangyuan - vaibhavjainwiz - - Xaenalt - z103cb + - Xaenalt reviewers: - dtrifiro + - fialhocoelho - heyselbi - - rpancham + - joerunde + - maxdebayser + - njhill + - prashantgupta24 - RH-steve-grubb + - rpancham - terrytangyuan - vaibhavjainwiz - Xaenalt diff --git a/README.md b/README.md index c24768bf78173..8e508195cdceb 100644 --- a/README.md +++ b/README.md @@ -16,27 +16,22 @@ Easy, fast, and cheap LLM serving for everyone --- -**Ray Summit CPF is Open (June 4th to June 20th)!** +**The Fifth vLLM Bay Area Meetup (July 24th 5pm-8pm PT)** -There will be a track for vLLM at the Ray Summit (09/30-10/02, SF) this year! -If you have cool projects related to vLLM or LLM inference, we would love to see your proposals. -This will be a great chance for everyone in the community to get together and learn. -Please submit your proposal [here](https://raysummit.anyscale.com/flow/anyscale/raysummit2024/landing/page/eventsite) +We are excited to announce our fifth vLLM Meetup! +Join us to hear the vLLM's recent updates and the upcoming roadmap. +Additionally, our collaborators from AWS will be presenting their insights and experiences in deploying vLLM. +Register now [here](https://lu.ma/lp0gyjqr) and be part of the event! --- *Latest News* 🔥 +- [2024/07] In partnership with Meta, vLLM officially supports Llama 3.1 with FP8 quantization and pipeline parallelism! Please check out our blog post [here](https://blog.vllm.ai/2024/07/23/llama31.html). - [2024/06] We hosted [the fourth vLLM meetup](https://lu.ma/agivllm) with Cloudflare and BentoML! Please find the meetup slides [here](https://docs.google.com/presentation/d/1iJ8o7V2bQEi0BFEljLTwc5G1S10_Rhv3beed5oB0NJ4/edit?usp=sharing). - [2024/04] We hosted [the third vLLM meetup](https://robloxandvllmmeetup2024.splashthat.com/) with Roblox! Please find the meetup slides [here](https://docs.google.com/presentation/d/1A--47JAK4BJ39t954HyTkvtfwn0fkqtsL8NGFuslReM/edit?usp=sharing). -- [2024/01] We hosted [the second vLLM meetup](https://lu.ma/ygxbpzhl) in SF! Please find the meetup slides [here](https://docs.google.com/presentation/d/12mI2sKABnUw5RBWXDYY-HtHth4iMSNcEoQ10jDQbxgA/edit?usp=sharing). -- [2024/01] Added ROCm 6.0 support to vLLM. -- [2023/12] Added ROCm 5.7 support to vLLM. -- [2023/10] We hosted [the first vLLM meetup](https://lu.ma/first-vllm-meetup) in SF! Please find the meetup slides [here](https://docs.google.com/presentation/d/1QL-XPFXiFpDBh86DbEegFXBXFXjix4v032GhShbKf3s/edit?usp=sharing). -- [2023/09] We created our [Discord server](https://discord.gg/jz7wjKhh6g)! Join us to discuss vLLM and LLM serving! We will also post the latest announcements and updates there. -- [2023/09] We released our [PagedAttention paper](https://arxiv.org/abs/2309.06180) on arXiv! +- [2024/01] We hosted [the second vLLM meetup](https://lu.ma/ygxbpzhl) with IBM! Please find the meetup slides [here](https://docs.google.com/presentation/d/12mI2sKABnUw5RBWXDYY-HtHth4iMSNcEoQ10jDQbxgA/edit?usp=sharing). +- [2023/10] We hosted [the first vLLM meetup](https://lu.ma/first-vllm-meetup) with a16z! Please find the meetup slides [here](https://docs.google.com/presentation/d/1QL-XPFXiFpDBh86DbEegFXBXFXjix4v032GhShbKf3s/edit?usp=sharing). - [2023/08] We would like to express our sincere gratitude to [Andreessen Horowitz](https://a16z.com/2023/08/30/supporting-the-open-source-ai-community/) (a16z) for providing a generous grant to support the open-source development and research of vLLM. -- [2023/07] Added support for LLaMA-2! 
You can run and serve 7B/13B/70B LLaMA-2s on vLLM with a single command! -- [2023/06] Serving vLLM On any Cloud with SkyPilot. Check out a 1-click [example](https://github.com/skypilot-org/skypilot/blob/master/llm/vllm) to start the vLLM demo, and the [blog post](https://blog.skypilot.co/serving-llm-24x-faster-on-the-cloud-with-vllm-and-skypilot/) for the story behind vLLM development on the clouds. - [2023/06] We officially released vLLM! FastChat-vLLM integration has powered [LMSYS Vicuna and Chatbot Arena](https://chat.lmsys.org) since mid-April. Check out our [blog post](https://vllm.ai). --- @@ -52,14 +47,16 @@ vLLM is fast with: - Quantization: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), [SqueezeLLM](https://arxiv.org/abs/2306.07629), FP8 KV Cache - Optimized CUDA kernels +**Performance benchmark**: We include a [performance benchmark](https://buildkite.com/vllm/performance-benchmark/builds/3924) that compares the performance of vllm against other LLM serving engines ([TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM), [text-generation-inference](https://github.com/huggingface/text-generation-inference) and [lmdeploy](https://github.com/InternLM/lmdeploy)). + vLLM is flexible and easy to use with: - Seamless integration with popular Hugging Face models - High-throughput serving with various decoding algorithms, including *parallel sampling*, *beam search*, and more -- Tensor parallelism support for distributed inference +- Tensor parallelism and pipeline parallelism support for distributed inference - Streaming outputs - OpenAI-compatible API server -- Support NVIDIA GPUs, AMD GPUs, and Intel CPUs +- Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs - (Experimental) Prefix caching support - (Experimental) Multi-lora support @@ -103,6 +100,7 @@ vLLM is a community project. Our compute resources for development and testing a - Databricks - DeepInfra - Dropbox +- Google Cloud - Lambda Lab - NVIDIA - Replicate diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py index 4350b96b04a6a..fbab547d094fe 100644 --- a/benchmarks/backend_request_func.py +++ b/benchmarks/backend_request_func.py @@ -225,8 +225,8 @@ async def async_request_openai_completions( ) -> RequestFuncOutput: api_url = request_func_input.api_url assert api_url.endswith( - "v1/completions" - ), "OpenAI Completions API URL must end with 'v1/completions'." + "completions" + ), "OpenAI Completions API URL must end with 'completions'." 
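Aside: a minimal, standalone sketch of the streaming time-to-first-token / inter-token-latency bookkeeping that this request function implements, assuming a hypothetical iterable of decoded stream chunks (`chunks`) and an accessor `text_of(chunk)` that are not part of this file. Chunks that carry no generated text (e.g. a trailing usage-only summary message) are skipped, so they contribute neither TTFT nor ITL samples:

import time

def measure_stream(chunks, text_of):
    # Returns (ttft, itl): time to first token and per-token latencies.
    ttft = None
    itl = []
    start = most_recent = time.perf_counter()
    for chunk in chunks:
        if not text_of(chunk):
            # e.g. a final usage-only message with no token; ignore it
            continue
        now = time.perf_counter()
        if ttft is None:
            ttft = now - start              # time to first token (TTFT)
        else:
            itl.append(now - most_recent)   # inter-token latency (ITL)
        most_recent = now
    return ttft, itl
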
async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: assert not request_func_input.use_beam_search @@ -265,6 +265,9 @@ async def async_request_openai_completions( else: data = json.loads(chunk) + # NOTE: Some completion API might have a last + # usage summary response without a token so we + # want to check a token was generated if data["choices"][0]["text"]: timestamp = time.perf_counter() # First token @@ -273,12 +276,8 @@ async def async_request_openai_completions( output.ttft = ttft # Decoding phase - # NOTE: Some completion API might have a last - # usage summary response without a token so we - # do not want to include as inter-token-latency - elif data.get("usage", None) is None: - output.itl.append(timestamp - - most_recent_timestamp) + output.itl.append(timestamp - + most_recent_timestamp) most_recent_timestamp = timestamp generated_text += data["choices"][0]["text"] @@ -305,8 +304,8 @@ async def async_request_openai_chat_completions( ) -> RequestFuncOutput: api_url = request_func_input.api_url assert api_url.endswith( - "v1/chat/completions" - ), "OpenAI Chat Completions API URL must end with 'v1/chat/completions'." + "chat/completions" + ), "OpenAI Chat Completions API URL must end with 'chat/completions'." async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: assert not request_func_input.use_beam_search @@ -391,17 +390,17 @@ def remove_prefix(text: str, prefix: str) -> str: return text -def get_model(pretrained_model_name_or_path: str): +def get_model(pretrained_model_name_or_path: str) -> str: if os.getenv('VLLM_USE_MODELSCOPE', 'False').lower() == 'true': from modelscope import snapshot_download - else: - from huggingface_hub import snapshot_download - model_path = snapshot_download( - model_id=pretrained_model_name_or_path, - local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE, - ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"]) - return model_path + model_path = snapshot_download( + model_id=pretrained_model_name_or_path, + local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE, + ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"]) + + return model_path + return pretrained_model_name_or_path def get_tokenizer( @@ -423,4 +422,5 @@ def get_tokenizer( "openai": async_request_openai_completions, "openai-chat": async_request_openai_chat_completions, "tensorrt-llm": async_request_trt_llm, + "scalellm": async_request_openai_completions, } diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index a4cf0632b7790..97afd301c8f24 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -11,7 +11,7 @@ from vllm import LLM, SamplingParams from vllm.engine.arg_utils import EngineArgs -from vllm.inputs import PromptStrictInputs +from vllm.inputs import PromptInputs from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS from vllm.utils import FlexibleArgumentParser @@ -25,6 +25,8 @@ def main(args: argparse.Namespace): model=args.model, speculative_model=args.speculative_model, num_speculative_tokens=args.num_speculative_tokens, + speculative_draft_tensor_parallel_size=\ + args.speculative_draft_tensor_parallel_size, tokenizer=args.tokenizer, quantization=args.quantization, tensor_parallel_size=args.tensor_parallel_size, @@ -44,6 +46,7 @@ def main(args: argparse.Namespace): load_format=args.load_format, distributed_executor_backend=args.distributed_executor_backend, otlp_traces_endpoint=args.otlp_traces_endpoint, + 
enable_prefix_caching=args.enable_prefix_caching, ) sampling_params = SamplingParams( @@ -58,7 +61,7 @@ def main(args: argparse.Namespace): dummy_prompt_token_ids = np.random.randint(10000, size=(args.batch_size, args.input_len)) - dummy_inputs: List[PromptStrictInputs] = [{ + dummy_inputs: List[PromptInputs] = [{ "prompt_token_ids": batch } for batch in dummy_prompt_token_ids.tolist()] @@ -127,6 +130,10 @@ def run_to_completion(profile_dir: Optional[str] = None): parser.add_argument('--model', type=str, default='facebook/opt-125m') parser.add_argument('--speculative-model', type=str, default=None) parser.add_argument('--num-speculative-tokens', type=int, default=None) + parser.add_argument('--speculative-draft-tensor-parallel-size', + '-spec-draft-tp', + type=int, + default=None) parser.add_argument('--tokenizer', type=str, default=None) parser.add_argument('--quantization', '-q', @@ -201,9 +208,10 @@ def run_to_completion(profile_dir: Optional[str] = None): parser.add_argument( "--device", type=str, - default="cuda", - choices=["cuda", "cpu", "tpu", "xpu"], - help='device type for vLLM execution, supporting CUDA and CPU.') + default="auto", + choices=["auto", "cuda", "cpu", "openvino", "tpu", "xpu"], + help='device type for vLLM execution, supporting CUDA, OpenVINO and ' + 'CPU.') parser.add_argument('--block-size', type=int, default=16, @@ -213,6 +221,9 @@ def run_to_completion(profile_dir: Optional[str] = None): action='store_true', help='If True, the prefill requests can be chunked based on the ' 'max_num_batched_tokens') + parser.add_argument("--enable-prefix-caching", + action='store_true', + help="Enable automatic prefix caching") parser.add_argument('--use-v2-block-manager', action='store_true') parser.add_argument( "--ray-workers-use-nsight", diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 42867fc40edd2..fc0dbf77f16b9 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -2,8 +2,8 @@ On the server side, run one of the following commands: vLLM OpenAI API server - python -m vllm.entrypoints.openai.api_server \ - --model --swap-space 16 \ + vllm serve \ + --swap-space 16 \ --disable-log-requests (TGI backend) @@ -17,7 +17,7 @@ --dataset-path \ --request-rate \ # By default is inf --num-prompts # By default is 1000 - + when using tgi backend, add --endpoint /generate_stream to the end of the command above. @@ -60,12 +60,15 @@ class BenchmarkMetrics: output_throughput: float mean_ttft_ms: float median_ttft_ms: float + std_ttft_ms: float p99_ttft_ms: float mean_tpot_ms: float median_tpot_ms: float + std_tpot_ms: float p99_tpot_ms: float mean_itl_ms: float median_itl_ms: float + std_itl_ms: float p99_itl_ms: float @@ -77,7 +80,6 @@ def sample_sharegpt_requests( ) -> List[Tuple[str, int, int]]: if fixed_output_len is not None and fixed_output_len < 4: raise ValueError("output_len too small") - # Load the dataset. 
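Aside: the request scheduler later in this file (`get_request`) models arrivals as a Poisson process, drawing each inter-request gap from an exponential distribution with mean 1 / request_rate so that long-run throughput matches `--request-rate`. A minimal standalone sketch of that schedule, for illustration only (the function name and `seed` parameter are hypothetical, not part of this file):

import numpy as np

def poisson_arrival_times(request_rate: float, num_requests: int, seed: int = 0):
    """Return cumulative send times (seconds) for num_requests requests."""
    rng = np.random.default_rng(seed)
    if request_rate == float("inf"):
        # Infinite rate: send everything immediately, no waiting between requests.
        return np.zeros(num_requests)
    # Exponential gaps with mean 1 / request_rate give a Poisson arrival process.
    gaps = rng.exponential(1.0 / request_rate, size=num_requests)
    return np.cumsum(gaps)
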
with open(dataset_path) as f: dataset = json.load(f) @@ -185,6 +187,31 @@ def sample_sonnet_requests( return sampled_requests +def sample_random_requests( + input_len: int, output_len: int, num_prompts: int, range_ratio: float, + tokenizer: PreTrainedTokenizerBase) -> List[Tuple[str, int, int]]: + + input_lens = np.random.randint( + int(input_len * range_ratio), + input_len + 1, + size=num_prompts, + ) + output_lens = np.random.randint( + int(output_len * range_ratio), + output_len + 1, + size=num_prompts, + ) + offsets = np.random.randint(0, tokenizer.vocab_size, size=num_prompts) + input_requests = [] + for i in range(num_prompts): + prompt = tokenizer.decode([(offsets[i] + i + j) % tokenizer.vocab_size + for j in range(input_lens[i])]) + input_requests.append( + (prompt, int(input_lens[i]), int(output_lens[i]))) + + return input_requests + + async def get_request( input_requests: List[Tuple[str, int, int]], request_rate: float, @@ -196,6 +223,7 @@ async def get_request( if request_rate == float("inf"): # If the request rate is infinity, then we don't need to wait. continue + # Sample the request interval from the exponential distribution. interval = np.random.exponential(1.0 / request_rate) # The next request will be sent after the interval. @@ -219,7 +247,7 @@ def calculate_metrics( # We use the tokenizer to count the number of output tokens for all # serving backends instead of looking at len(outputs[i].itl) since # multiple output tokens may be bundled together - # Note: this may inflate the output token count slightly + # Note : this may inflate the output token count slightly output_len = len( tokenizer(outputs[i].generated_text, add_special_tokens=False).input_ids) @@ -249,12 +277,15 @@ def calculate_metrics( mean_ttft_ms=np.mean(ttfts or 0) * 1000, # ttfts is empty if streaming is not supported by backend median_ttft_ms=np.median(ttfts or 0) * 1000, + std_ttft_ms=np.std(ttfts or 0) * 1000, p99_ttft_ms=np.percentile(ttfts or 0, 99) * 1000, mean_tpot_ms=np.mean(tpots or 0) * 1000, median_tpot_ms=np.median(tpots or 0) * 1000, + std_tpot_ms=np.std(tpots or 0) * 1000, p99_tpot_ms=np.percentile(tpots or 0, 99) * 1000, mean_itl_ms=np.mean(itls or 0) * 1000, median_itl_ms=np.median(itls or 0) * 1000, + std_itl_ms=np.std(itls or 0) * 1000, p99_itl_ms=np.percentile(itls or 0, 99) * 1000, ) @@ -371,12 +402,15 @@ async def benchmark( "output_throughput": metrics.output_throughput, "mean_ttft_ms": metrics.mean_ttft_ms, "median_ttft_ms": metrics.median_ttft_ms, + "std_ttft_ms": metrics.std_ttft_ms, "p99_ttft_ms": metrics.p99_ttft_ms, "mean_tpot_ms": metrics.mean_tpot_ms, "median_tpot_ms": metrics.median_tpot_ms, + "std_tpot_ms": metrics.std_tpot_ms, "p99_tpot_ms": metrics.p99_tpot_ms, "mean_itl_ms": metrics.mean_itl_ms, "median_itl_ms": metrics.median_itl_ms, + "std_itl_ms": metrics.std_itl_ms, "p99_itl_ms": metrics.p99_itl_ms, "input_lens": [output.prompt_len for output in outputs], "output_lens": actual_output_lens, @@ -456,6 +490,15 @@ def main(args: argparse.Namespace): for prompt, prompt_formatted, prompt_len, output_len in input_requests] + elif args.dataset_name == "random": + input_requests = sample_random_requests( + input_len=args.random_input_len, + output_len=args.random_output_len, + num_prompts=args.num_prompts, + range_ratio=args.random_range_ratio, + tokenizer=tokenizer, + ) + else: raise ValueError(f"Unknown dataset: {args.dataset_name}") @@ -549,7 +592,7 @@ def main(args: argparse.Namespace): "--dataset-name", type=str, default="sharegpt", - choices=["sharegpt", "sonnet"], + 
choices=["sharegpt", "sonnet", "random"], help="Name of the dataset to benchmark on.", ) parser.add_argument("--dataset-path", @@ -566,7 +609,7 @@ def main(args: argparse.Namespace): "--tokenizer", type=str, help= - "Name or path of the tokenizer, if not using the default tokenizer.", + "Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501 ) parser.add_argument( "--best-of", @@ -609,6 +652,27 @@ def main(args: argparse.Namespace): help= "Number of prefix tokens per request, used only for sonnet dataset.", ) + parser.add_argument( + "--random-input-len", + type=int, + default=1024, + help= + "Number of input tokens per request, used only for random sampling.", + ) + parser.add_argument( + "--random-output-len", + type=int, + default=128, + help= + "Number of output tokens per request, used only for random sampling.", + ) + parser.add_argument( + "--random-range-ratio", + type=float, + default=1.0, + help="Range of sampled ratio of input/output length, " + "used only for random sampling.", + ) parser.add_argument( "--request-rate", type=float, diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 2c6beb4e89672..a52e67bbbe7e3 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -349,9 +349,10 @@ def main(args: argparse.Namespace): parser.add_argument( "--device", type=str, - default="cuda", - choices=["cuda", "cpu", "tpu", "xpu"], - help='device type for vLLM execution, supporting CUDA and CPU.') + default="auto", + choices=["auto", "cuda", "cpu", "openvino", "tpu", "xpu"], + help='device type for vLLM execution, supporting CUDA, OpenVINO and ' + 'CPU.') parser.add_argument( "--enable-prefix-caching", action='store_true', diff --git a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py index 377f8683c021f..234c2c8a1074c 100644 --- a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py +++ b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py @@ -20,18 +20,18 @@ # helpers -def to_fp8(tensor: torch.tensor) -> torch.tensor: +def to_fp8(tensor: torch.Tensor) -> torch.Tensor: finfo = torch.finfo(torch.float8_e4m3fn) return torch.round(tensor.clamp( min=finfo.min, max=finfo.max)).to(dtype=torch.float8_e4m3fn) -def to_int8(tensor: torch.tensor) -> torch.tensor: +def to_int8(tensor: torch.Tensor) -> torch.Tensor: return torch.round(tensor.clamp(min=-128, max=127)).to(dtype=torch.int8) def make_rand_tensors(dtype: torch.dtype, m: int, n: int, - k: int) -> Tuple[torch.tensor, torch.tensor]: + k: int) -> Tuple[torch.Tensor, torch.Tensor]: a = torch.randn((m, k), device='cuda') * 5 b = torch.randn((n, k), device='cuda').t() * 5 @@ -47,15 +47,15 @@ def make_rand_tensors(dtype: torch.dtype, m: int, n: int, # impl -def pytorch_mm_impl(a: torch.tensor, b: torch.tensor, scale_a: torch.tensor, - scale_b: torch.tensor, - out_dtype: torch.dtype) -> torch.tensor: +def pytorch_mm_impl(a: torch.Tensor, b: torch.Tensor, scale_a: torch.Tensor, + scale_b: torch.Tensor, + out_dtype: torch.dtype) -> torch.Tensor: return torch.mm(a, b) -def pytorch_fp8_impl(a: torch.tensor, b: torch.tensor, scale_a: torch.tensor, - scale_b: torch.tensor, - out_dtype: torch.dtype) -> torch.tensor: +def pytorch_fp8_impl(a: torch.Tensor, b: torch.Tensor, scale_a: torch.Tensor, + scale_b: torch.Tensor, + out_dtype: torch.dtype) -> torch.Tensor: return torch._scaled_mm(a, b, scale_a=scale_a, @@ -63,9 +63,9 @@ def pytorch_fp8_impl(a: torch.tensor, b: torch.tensor, scale_a: torch.tensor, 
out_dtype=out_dtype) -def pytorch_fp8_impl_fast_accum(a: torch.tensor, b: torch.tensor, - scale_a: torch.tensor, scale_b: torch.tensor, - out_dtype: torch.dtype) -> torch.tensor: +def pytorch_fp8_impl_fast_accum(a: torch.Tensor, b: torch.Tensor, + scale_a: torch.Tensor, scale_b: torch.Tensor, + out_dtype: torch.dtype) -> torch.Tensor: return torch._scaled_mm(a, b, scale_a=scale_a, @@ -74,15 +74,15 @@ def pytorch_fp8_impl_fast_accum(a: torch.tensor, b: torch.tensor, use_fast_accum=True) -def cutlass_impl(a: torch.tensor, b: torch.tensor, scale_a: torch.tensor, - scale_b: torch.tensor, - out_dtype: torch.dtype) -> torch.tensor: +def cutlass_impl(a: torch.Tensor, b: torch.Tensor, scale_a: torch.Tensor, + scale_b: torch.Tensor, + out_dtype: torch.dtype) -> torch.Tensor: return ops.cutlass_scaled_mm(a, b, scale_a, scale_b, out_dtype=out_dtype) # bench -def bench_fn(a: torch.tensor, b: torch.tensor, scale_a: torch.tensor, - scale_b: torch.tensor, out_dtype: torch.dtype, label: str, +def bench_fn(a: torch.Tensor, b: torch.Tensor, scale_a: torch.Tensor, + scale_b: torch.Tensor, out_dtype: torch.dtype, label: str, sub_label: str, fn: Callable, description: str) -> TMeasurement: min_run_time = 1 diff --git a/benchmarks/kernels/benchmark_marlin.py b/benchmarks/kernels/benchmark_marlin.py index 261f5829631ee..3da4cecd7eeff 100644 --- a/benchmarks/kernels/benchmark_marlin.py +++ b/benchmarks/kernels/benchmark_marlin.py @@ -5,14 +5,16 @@ from benchmark_shapes import WEIGHT_SHAPES from vllm import _custom_ops as ops -from vllm.model_executor.layers.quantization.gptq_marlin import ( - GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_N, - GPTQ_MARLIN_SUPPORTED_GROUP_SIZES, GPTQ_MARLIN_SUPPORTED_NUM_BITS) from vllm.model_executor.layers.quantization.gptq_marlin_24 import ( GPTQ_MARLIN_24_MAX_PARALLEL, GPTQ_MARLIN_24_MIN_THREAD_N, GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES, GPTQ_MARLIN_24_SUPPORTED_NUM_BITS) from vllm.model_executor.layers.quantization.utils.marlin_utils import ( - MarlinWorkspace, marlin_24_quantize, marlin_quantize) + GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_N, + GPTQ_MARLIN_SUPPORTED_GROUP_SIZES, GPTQ_MARLIN_SUPPORTED_NUM_BITS) +from vllm.model_executor.layers.quantization.utils.marlin_utils_test import ( + MarlinWorkspace, marlin_quantize) +from vllm.model_executor.layers.quantization.utils.marlin_utils_test_24 import ( + marlin_24_quantize) from vllm.model_executor.layers.quantization.utils.quant_utils import ( gptq_pack, quantize_weights, sort_weights) from vllm.utils import FlexibleArgumentParser diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py index 16de60477c305..78cac8a555d1b 100644 --- a/benchmarks/kernels/benchmark_paged_attention.py +++ b/benchmarks/kernels/benchmark_paged_attention.py @@ -100,7 +100,7 @@ def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float: start_time = time.perf_counter() # Using default kv_scale - kv_scale = 1.0 + k_scale = v_scale = 1.0 for _ in range(num_iters): if version == "v1": @@ -117,7 +117,8 @@ def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float: max_seq_len, alibi_slopes, kv_cache_dtype, - kv_scale, + k_scale, + v_scale, ) elif version == "v2": ops.paged_attention_v2( @@ -136,7 +137,8 @@ def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float: max_seq_len, alibi_slopes, kv_cache_dtype, - kv_scale, + k_scale, + v_scale, ) else: raise ValueError(f"Invalid version: {version}") diff --git a/cmake/cpu_extension.cmake 
b/cmake/cpu_extension.cmake index 511e443f78403..690559ee265e9 100644 --- a/cmake/cpu_extension.cmake +++ b/cmake/cpu_extension.cmake @@ -46,6 +46,8 @@ is_avx512_disabled(AVX512_DISABLED) find_isa(${CPUINFO} "avx2" AVX2_FOUND) find_isa(${CPUINFO} "avx512f" AVX512_FOUND) +find_isa(${CPUINFO} "POWER10" POWER10_FOUND) +find_isa(${CPUINFO} "POWER9" POWER9_FOUND) if (AVX512_FOUND AND NOT AVX512_DISABLED) list(APPEND CXX_COMPILE_FLAGS @@ -68,8 +70,15 @@ if (AVX512_FOUND AND NOT AVX512_DISABLED) elseif (AVX2_FOUND) list(APPEND CXX_COMPILE_FLAGS "-mavx2") message(WARNING "vLLM CPU backend using AVX2 ISA") +elseif (POWER9_FOUND OR POWER10_FOUND) + message(STATUS "PowerPC detected") + # Check for PowerPC VSX support + list(APPEND CXX_COMPILE_FLAGS + "-mvsx" + "-mcpu=native" + "-mtune=native") else() - message(FATAL_ERROR "vLLM CPU backend requires AVX512 or AVX2 ISA support.") + message(FATAL_ERROR "vLLM CPU backend requires AVX512 or AVX2 or Power9+ ISA support.") endif() message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}") diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 071e16336dfa2..4869cad541135 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -147,19 +147,23 @@ macro(override_gpu_arches GPU_ARCHES GPU_LANG GPU_SUPPORTED_ARCHES) if (${GPU_LANG} STREQUAL "HIP") # # `GPU_ARCHES` controls the `--offload-arch` flags. - # `CMAKE_HIP_ARCHITECTURES` is set up by torch and can be controlled - # via the `PYTORCH_ROCM_ARCH` env variable. # - + # If PYTORCH_ROCM_ARCH env variable exists, then we take it as a list, + # if not, then we use CMAKE_HIP_ARCHITECTURES which was generated by calling + # "rocm_agent_enumerator" in "enable_language(HIP)" + # (in file Modules/CMakeDetermineHIPCompiler.cmake) + # + if(DEFINED ENV{PYTORCH_ROCM_ARCH}) + set(HIP_ARCHITECTURES $ENV{PYTORCH_ROCM_ARCH}) + else() + set(HIP_ARCHITECTURES ${CMAKE_HIP_ARCHITECTURES}) + endif() # # Find the intersection of the supported + detected architectures to # set the module architecture flags. # - - set(VLLM_ROCM_SUPPORTED_ARCHS "gfx908;gfx90a;gfx942;gfx1100") - set(${GPU_ARCHES}) - foreach (_ARCH ${VLLM_ROCM_SUPPORTED_ARCHS}) + foreach (_ARCH ${HIP_ARCHITECTURES}) if (_ARCH IN_LIST _GPU_SUPPORTED_ARCHES_LIST) list(APPEND ${GPU_ARCHES} ${_ARCH}) endif() @@ -167,7 +171,7 @@ macro(override_gpu_arches GPU_ARCHES GPU_LANG GPU_SUPPORTED_ARCHES) if(NOT ${GPU_ARCHES}) message(FATAL_ERROR - "None of the detected ROCm architectures: ${CMAKE_HIP_ARCHITECTURES} is" + "None of the detected ROCm architectures: ${HIP_ARCHITECTURES} is" " supported. 
Supported ROCm architectures are: ${_GPU_SUPPORTED_ARCHES_LIST}.") endif() diff --git a/csrc/attention/attention_kernels.cu b/csrc/attention/attention_kernels.cu index 91083481705cb..350dbce1d7ba9 100644 --- a/csrc/attention/attention_kernels.cu +++ b/csrc/attention/attention_kernels.cu @@ -105,9 +105,9 @@ __device__ void paged_attention_kernel( const int max_num_blocks_per_seq, const float* __restrict__ alibi_slopes, // [num_heads] const int q_stride, const int kv_block_stride, const int kv_head_stride, - const float kv_scale, const int tp_rank, const int blocksparse_local_blocks, - const int blocksparse_vert_stride, const int blocksparse_block_size, - const int blocksparse_head_sliding_step) { + const float k_scale, const float v_scale, const int tp_rank, + const int blocksparse_local_blocks, const int blocksparse_vert_stride, + const int blocksparse_block_size, const int blocksparse_head_sliding_step) { const int seq_idx = blockIdx.y; const int partition_idx = blockIdx.z; const int max_num_partitions = gridDim.z; @@ -285,7 +285,7 @@ __device__ void paged_attention_kernel( Quant_vec k_vec_quant = *reinterpret_cast( k_ptr + offset1 * BLOCK_SIZE * x + offset2); k_vecs[j] = fp8::scaled_convert( - k_vec_quant, kv_scale); + k_vec_quant, k_scale); } } @@ -415,7 +415,7 @@ __device__ void paged_attention_kernel( *reinterpret_cast(v_ptr + offset); // Vector conversion from V_quant_vec to V_vec. v_vec = fp8::scaled_convert(v_quant_vec, - kv_scale); + v_scale); } if (block_idx == num_seq_blocks - 1) { // NOTE(woosuk): When v_vec contains the tokens that are out of the @@ -513,15 +513,15 @@ __global__ void paged_attention_v1_kernel( const int max_num_blocks_per_seq, const float* __restrict__ alibi_slopes, // [num_heads] const int q_stride, const int kv_block_stride, const int kv_head_stride, - const float kv_scale, const int tp_rank, const int blocksparse_local_blocks, - const int blocksparse_vert_stride, const int blocksparse_block_size, - const int blocksparse_head_sliding_step) { + const float k_scale, const float v_scale, const int tp_rank, + const int blocksparse_local_blocks, const int blocksparse_vert_stride, + const int blocksparse_block_size, const int blocksparse_head_sliding_step) { paged_attention_kernel( /* exp_sums */ nullptr, /* max_logits */ nullptr, out, q, k_cache, v_cache, num_kv_heads, scale, block_tables, seq_lens, max_num_blocks_per_seq, alibi_slopes, q_stride, kv_block_stride, - kv_head_stride, kv_scale, tp_rank, blocksparse_local_blocks, + kv_head_stride, k_scale, v_scale, tp_rank, blocksparse_local_blocks, blocksparse_vert_stride, blocksparse_block_size, blocksparse_head_sliding_step); } @@ -549,14 +549,14 @@ __global__ void paged_attention_v2_kernel( const int max_num_blocks_per_seq, const float* __restrict__ alibi_slopes, // [num_heads] const int q_stride, const int kv_block_stride, const int kv_head_stride, - const float kv_scale, const int tp_rank, const int blocksparse_local_blocks, - const int blocksparse_vert_stride, const int blocksparse_block_size, - const int blocksparse_head_sliding_step) { + const float k_scale, const float v_scale, const int tp_rank, + const int blocksparse_local_blocks, const int blocksparse_vert_stride, + const int blocksparse_block_size, const int blocksparse_head_sliding_step) { paged_attention_kernel( exp_sums, max_logits, tmp_out, q, k_cache, v_cache, num_kv_heads, scale, block_tables, seq_lens, max_num_blocks_per_seq, alibi_slopes, q_stride, - kv_block_stride, kv_head_stride, kv_scale, tp_rank, + kv_block_stride, kv_head_stride, k_scale, 
v_scale, tp_rank, blocksparse_local_blocks, blocksparse_vert_stride, blocksparse_block_size, blocksparse_head_sliding_step); } @@ -682,7 +682,7 @@ __global__ void paged_attention_v2_reduce_kernel( out_ptr, query_ptr, key_cache_ptr, value_cache_ptr, num_kv_heads, \ scale, block_tables_ptr, seq_lens_ptr, max_num_blocks_per_seq, \ alibi_slopes_ptr, q_stride, kv_block_stride, kv_head_stride, \ - kv_scale, tp_rank, blocksparse_local_blocks, \ + k_scale, v_scale, tp_rank, blocksparse_local_blocks, \ blocksparse_vert_stride, blocksparse_block_size, \ blocksparse_head_sliding_step); @@ -694,8 +694,8 @@ void paged_attention_v1_launcher( torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache, torch::Tensor& value_cache, int num_kv_heads, float scale, torch::Tensor& block_tables, torch::Tensor& seq_lens, int max_seq_len, - const c10::optional& alibi_slopes, float kv_scale, - const int tp_rank, const int blocksparse_local_blocks, + const c10::optional& alibi_slopes, float k_scale, + float v_scale, const int tp_rank, const int blocksparse_local_blocks, const int blocksparse_vert_stride, const int blocksparse_block_size, const int blocksparse_head_sliding_step) { int num_seqs = query.size(0); @@ -770,7 +770,7 @@ void paged_attention_v1_launcher( paged_attention_v1_launcher( \ out, query, key_cache, value_cache, num_kv_heads, scale, block_tables, \ - seq_lens, max_seq_len, alibi_slopes, kv_scale, tp_rank, \ + seq_lens, max_seq_len, alibi_slopes, k_scale, v_scale, tp_rank, \ blocksparse_local_blocks, blocksparse_vert_stride, \ blocksparse_block_size, blocksparse_head_sliding_step); @@ -815,8 +815,8 @@ void paged_attention_v1( torch::Tensor& seq_lens, // [num_seqs] int64_t block_size, int64_t max_seq_len, const c10::optional& alibi_slopes, - const std::string& kv_cache_dtype, double kv_scale, const int64_t tp_rank, - const int64_t blocksparse_local_blocks, + const std::string& kv_cache_dtype, double k_scale, double v_scale, + const int64_t tp_rank, const int64_t blocksparse_local_blocks, const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size, const int64_t blocksparse_head_sliding_step) { const bool is_block_sparse = (blocksparse_vert_stride > 1); @@ -833,7 +833,7 @@ void paged_attention_v1( exp_sums_ptr, max_logits_ptr, tmp_out_ptr, query_ptr, key_cache_ptr, \ value_cache_ptr, num_kv_heads, scale, block_tables_ptr, \ seq_lens_ptr, max_num_blocks_per_seq, alibi_slopes_ptr, q_stride, \ - kv_block_stride, kv_head_stride, kv_scale, tp_rank, \ + kv_block_stride, kv_head_stride, k_scale, v_scale, tp_rank, \ blocksparse_local_blocks, blocksparse_vert_stride, \ blocksparse_block_size, blocksparse_head_sliding_step); \ vllm::paged_attention_v2_reduce_kernel& alibi_slopes, float kv_scale, - const int tp_rank, const int blocksparse_local_blocks, + const c10::optional& alibi_slopes, float k_scale, + float v_scale, const int tp_rank, const int blocksparse_local_blocks, const int blocksparse_vert_stride, const int blocksparse_block_size, const int blocksparse_head_sliding_step) { int num_seqs = query.size(0); @@ -932,8 +932,9 @@ void paged_attention_v2_launcher( IS_BLOCK_SPARSE>( \ out, exp_sums, max_logits, tmp_out, query, key_cache, value_cache, \ num_kv_heads, scale, block_tables, seq_lens, max_seq_len, alibi_slopes, \ - kv_scale, tp_rank, blocksparse_local_blocks, blocksparse_vert_stride, \ - blocksparse_block_size, blocksparse_head_sliding_step); + k_scale, v_scale, tp_rank, blocksparse_local_blocks, \ + blocksparse_vert_stride, blocksparse_block_size, \ + 
blocksparse_head_sliding_step); #define CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE) \ switch (is_block_sparse) { \ @@ -980,8 +981,8 @@ void paged_attention_v2( torch::Tensor& seq_lens, // [num_seqs] int64_t block_size, int64_t max_seq_len, const c10::optional& alibi_slopes, - const std::string& kv_cache_dtype, double kv_scale, const int64_t tp_rank, - const int64_t blocksparse_local_blocks, + const std::string& kv_cache_dtype, double k_scale, double v_scale, + const int64_t tp_rank, const int64_t blocksparse_local_blocks, const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size, const int64_t blocksparse_head_sliding_step) { const bool is_block_sparse = (blocksparse_vert_stride > 1); diff --git a/csrc/cache.h b/csrc/cache.h index 86caa9345361d..52177e8901a89 100644 --- a/csrc/cache.h +++ b/csrc/cache.h @@ -18,8 +18,8 @@ void copy_blocks(std::vector const& key_caches, void reshape_and_cache(torch::Tensor& key, torch::Tensor& value, torch::Tensor& key_cache, torch::Tensor& value_cache, torch::Tensor& slot_mapping, - const std::string& kv_cache_dtype, - const double kv_scale); + const std::string& kv_cache_dtype, const double k_scale, + const double v_scale); void reshape_and_cache_flash(torch::Tensor& key, torch::Tensor& value, torch::Tensor& key_cache, diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu index 72041076ae009..caef7f5e18630 100644 --- a/csrc/cache_kernels.cu +++ b/csrc/cache_kernels.cu @@ -159,8 +159,8 @@ __global__ void reshape_and_cache_kernel( // block_size] const int64_t* __restrict__ slot_mapping, // [num_tokens] const int key_stride, const int value_stride, const int num_heads, - const int head_size, const int block_size, const int x, - const float kv_scale) { + const int head_size, const int block_size, const int x, const float k_scale, + const float v_scale) { const int64_t token_idx = blockIdx.x; const int64_t slot_idx = slot_mapping[token_idx]; if (slot_idx < 0) { @@ -196,9 +196,9 @@ __global__ void reshape_and_cache_kernel( value_cache[tgt_value_idx] = tgt_value; } else { key_cache[tgt_key_idx] = - fp8::scaled_convert(tgt_key, kv_scale); + fp8::scaled_convert(tgt_key, k_scale); value_cache[tgt_value_idx] = - fp8::scaled_convert(tgt_value, kv_scale); + fp8::scaled_convert(tgt_value, v_scale); } } } @@ -248,7 +248,7 @@ __global__ void reshape_and_cache_flash_kernel( reinterpret_cast(key_cache.data_ptr()), \ reinterpret_cast(value_cache.data_ptr()), \ slot_mapping.data_ptr(), key_stride, value_stride, \ - num_heads, head_size, block_size, x, kv_scale); + num_heads, head_size, block_size, x, k_scale, v_scale); void reshape_and_cache( torch::Tensor& key, // [num_tokens, num_heads, head_size] @@ -258,7 +258,8 @@ void reshape_and_cache( torch::Tensor& value_cache, // [num_blocks, num_heads, head_size, block_size] torch::Tensor& slot_mapping, // [num_tokens] - const std::string& kv_cache_dtype, const double kv_scale) { + const std::string& kv_cache_dtype, const double k_scale, + const double v_scale) { int num_tokens = key.size(0); int num_heads = key.size(1); int head_size = key.size(2); @@ -318,13 +319,13 @@ namespace vllm { template __global__ void convert_fp8_kernel(const Tin* __restrict__ src_cache, Tout* __restrict__ dst_cache, - const float kv_scale, + const float scale, const int64_t block_stride) { const int64_t block_idx = blockIdx.x; for (int i = threadIdx.x; i < block_stride; i += blockDim.x) { int64_t idx = block_idx * block_stride + i; dst_cache[idx] = - fp8::scaled_convert(src_cache[idx], kv_scale); + 
fp8::scaled_convert(src_cache[idx], scale); } } @@ -333,11 +334,11 @@ __global__ void convert_fp8_kernel(const Tin* __restrict__ src_cache, #define CALL_CONVERT_FP8(Tout, Tin, KV_DTYPE) \ vllm::convert_fp8_kernel<<>>( \ reinterpret_cast(src_cache.data_ptr()), \ - reinterpret_cast(dst_cache.data_ptr()), kv_scale, block_stride); + reinterpret_cast(dst_cache.data_ptr()), scale, block_stride); // Only for testing. void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache, - const double kv_scale, const std::string& kv_cache_dtype) { + const double scale, const std::string& kv_cache_dtype) { torch::Device src_device = src_cache.device(); torch::Device dst_device = dst_cache.device(); TORCH_CHECK(src_device.is_cuda(), "src must be on a GPU") diff --git a/csrc/cpu/attention.cpp b/csrc/cpu/attention.cpp index 8367093325314..abb4e3bea14bb 100644 --- a/csrc/cpu/attention.cpp +++ b/csrc/cpu/attention.cpp @@ -423,11 +423,11 @@ void paged_attention_v1( torch::Tensor& value_cache, int64_t num_kv_heads, double scale, torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size, int64_t max_seq_len, const c10::optional& alibi_slopes, - const std::string& kv_cache_dtype, double kv_scale, const int64_t tp_rank, - const int64_t blocksparse_local_blocks, + const std::string& kv_cache_dtype, double k_scale, double v_scale, + const int64_t tp_rank, const int64_t blocksparse_local_blocks, const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size, const int64_t blocksparse_head_sliding_step) { - TORCH_CHECK(kv_scale == 1.0f); + TORCH_CHECK(k_scale == 1.0f && v_scale == 1.0f); TORCH_CHECK(blocksparse_vert_stride <= 1, "CPU backend does not support blocksparse attention yet."); VLLM_DISPATCH_FLOATING_TYPES(query.scalar_type(), "paged_attention_v1_impl", @@ -742,11 +742,11 @@ void paged_attention_v2( torch::Tensor& value_cache, int64_t num_kv_heads, double scale, torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size, int64_t max_seq_len, const c10::optional& alibi_slopes, - const std::string& kv_cache_dtype, double kv_scale, const int64_t tp_rank, - const int64_t blocksparse_local_blocks, + const std::string& kv_cache_dtype, double k_scale, double v_scale, + const int64_t tp_rank, const int64_t blocksparse_local_blocks, const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size, const int64_t blocksparse_head_sliding_step) { - TORCH_CHECK(kv_scale == 1.0f); + TORCH_CHECK(k_scale == 1.0f && v_scale == 1.0f); TORCH_CHECK(blocksparse_vert_stride <= 1, "CPU backend does not support blocksparse attention yet."); VLLM_DISPATCH_FLOATING_TYPES(query.scalar_type(), "paged_attention_v2_impl", diff --git a/csrc/cpu/cache.cpp b/csrc/cpu/cache.cpp index 2b5c3bd6ee70b..31d454328b2c1 100644 --- a/csrc/cpu/cache.cpp +++ b/csrc/cpu/cache.cpp @@ -107,8 +107,9 @@ void copy_blocks(std::vector const& key_caches, void reshape_and_cache(torch::Tensor& key, torch::Tensor& value, torch::Tensor& key_cache, torch::Tensor& value_cache, torch::Tensor& slot_mapping, - const std::string& kv_cache_dtype, double kv_scale) { - TORCH_CHECK(kv_scale == 1.0f); + const std::string& kv_cache_dtype, double k_scale, + double v_scale) { + TORCH_CHECK(k_scale == 1.0f && v_scale == 1.0f); int num_tokens = key.size(0); int num_heads = key.size(1); diff --git a/csrc/cpu/cpu_types.hpp b/csrc/cpu/cpu_types.hpp index d7621aaae81c9..0213be09105ed 100644 --- a/csrc/cpu/cpu_types.hpp +++ b/csrc/cpu/cpu_types.hpp @@ -2,514 +2,14 @@ #ifndef CPU_TYPES_HPP #define CPU_TYPES_HPP -#include 
-#include - -#ifndef __AVX2__ -static_assert(false, "AVX2 must be supported for the current implementation."); -#endif - -namespace vec_op { - -// FIXME: FP16 is not fully supported in Torch-CPU -#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ - AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ - AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) - -#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \ - AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)) - -#ifndef CPU_OP_GUARD -#define CPU_KERNEL_GUARD_IN(NAME) -#define CPU_KERNEL_GUARD_OUT(NAME) -#else -#define CPU_KERNEL_GUARD_IN(NAME) \ - std::cout << #NAME << " invoked." << std::endl; -#define CPU_KERNEL_GUARD_OUT(NAME) std::cout << #NAME << " exit." << std::endl; -#endif - -#define FORCE_INLINE __attribute__((always_inline)) inline - -namespace { -template -constexpr void unroll_loop_item(std::integer_sequence, F &&f) { - (f(std::integral_constant{}), ...); -} -}; // namespace - -template >> -constexpr void unroll_loop(F &&f) { - unroll_loop_item(std::make_integer_sequence{}, std::forward(f)); -} - -template struct Vec { - constexpr static int get_elem_num() { return T::VEC_ELEM_NUM; } -}; - -struct FP32Vec8; -struct FP32Vec16; - -#ifdef __AVX512FP16__ -struct FP16Vec8 : public Vec { - constexpr static int VEC_ELEM_NUM = 8; - - __m128h reg; - - explicit FP16Vec8(_Float16 v) : reg(_mm_set1_ph(v)) {} - - explicit FP16Vec8(const void *ptr) : reg(_mm_loadu_ph(ptr)) {} - - explicit FP16Vec8(__m128h data) : reg(data) {} - - FP16Vec8 operator*(const FP16Vec8 &b) const { - return FP16Vec8(_mm_mul_ph(reg, b.reg)); - } - - FP16Vec8 operator+(const FP16Vec8 &b) const { - return FP16Vec8(_mm_add_ph(reg, b.reg)); - } - - FP16Vec8 operator-(const FP16Vec8 &b) const { - return FP16Vec8(_mm_sub_ph(reg, b.reg)); - } - - FP16Vec8 operator/(const FP16Vec8 &b) const { - return FP16Vec8(_mm_div_ph(reg, b.reg)); - } - - void save(void *ptr) const { _mm_storeu_ph(ptr, reg); } -}; -#endif - -struct BF16Vec8 : public Vec { - constexpr static int VEC_ELEM_NUM = 8; - - __m128i reg; - - explicit BF16Vec8(const void *ptr) - : reg((__m128i)_mm_loadu_si128((__m128i *)ptr)) {} - - explicit BF16Vec8(const FP32Vec8 &); - - void save(void *ptr) const { *reinterpret_cast<__m128i *>(ptr) = reg; } -}; - -struct BF16Vec16 : public Vec { - constexpr static int VEC_ELEM_NUM = 16; - - __m256i reg; - - explicit BF16Vec16(const void *ptr) - : reg((__m256i)_mm256_loadu_si256((__m256i *)ptr)) {} - - explicit BF16Vec16(const FP32Vec16 &); - - void save(void *ptr) const { *reinterpret_cast<__m256i *>(ptr) = reg; } -}; - -#ifdef __AVX512F__ -struct BF16Vec32 : public Vec { - constexpr static int VEC_ELEM_NUM = 32; - - __m512i reg; - - explicit BF16Vec32(const void *ptr) : reg((__m512i)_mm512_loadu_si512(ptr)) {} - - explicit BF16Vec32(__m512i data) : reg(data) {} - - explicit BF16Vec32(BF16Vec8 &vec8_data) - : reg((__m512i)_mm512_inserti32x4( - _mm512_inserti32x4(_mm512_inserti32x4(_mm512_castsi128_si512( - (__m128i)vec8_data.reg), - (__m128i)vec8_data.reg, 1), - (__m128i)vec8_data.reg, 2), - (__m128i)vec8_data.reg, 3)) {} - - void save(void *ptr) const { *reinterpret_cast<__m512i *>(ptr) = reg; } -}; -#else -struct BF16Vec32 : public Vec { - constexpr static int VEC_ELEM_NUM = 32; - - __m256i reg_low; - __m256i reg_high; - - explicit BF16Vec32(const void *ptr) - : reg_low(_mm256_loadu_si256((__m256i const *)ptr)), - reg_high(_mm256_loadu_si256((__m256i const *)ptr + 1)) {} - - explicit BF16Vec32(__m256i low, __m256i high) : reg_low(low), - 
reg_high(high) {} - - explicit BF16Vec32(BF16Vec8 &vec8_data) - : reg_low((__m256i)_mm256_inserti32x4( - _mm256_castsi128_si256((__m128i)vec8_data.reg), - (__m128i)vec8_data.reg, 1)), - reg_high((__m256i)_mm256_inserti32x4( - _mm256_castsi128_si256((__m128i)vec8_data.reg), - (__m128i)vec8_data.reg, 1)) {} - - void save(void *ptr) const { - *reinterpret_cast<__m256i *>(ptr) = reg_low; - *reinterpret_cast<__m256i *>((__m256i *)ptr + 1) = reg_high; - } -}; -#endif - -struct FP32Vec4 : public Vec { - constexpr static int VEC_ELEM_NUM = 4; - union AliasReg { - __m128 reg; - float values[VEC_ELEM_NUM]; - }; - - __m128 reg; - - explicit FP32Vec4(float v) : reg(_mm_set1_ps(v)) {} - - explicit FP32Vec4() : reg(_mm_set1_ps(0.0)) {} - - explicit FP32Vec4(const float *ptr) : reg(_mm_loadu_ps(ptr)) {} - - explicit FP32Vec4(__m128 data) : reg(data) {} - - explicit FP32Vec4(const FP32Vec4 &data) : reg(data.reg) {} -}; - -struct FP32Vec8 : public Vec { - constexpr static int VEC_ELEM_NUM = 8; - union AliasReg { - __m256 reg; - float values[VEC_ELEM_NUM]; - }; - - __m256 reg; - - explicit FP32Vec8(float v) : reg(_mm256_set1_ps(v)) {} - - explicit FP32Vec8() : reg(_mm256_set1_ps(0.0)) {} - - explicit FP32Vec8(const float *ptr) : reg(_mm256_loadu_ps(ptr)) {} - - explicit FP32Vec8(__m256 data) : reg(data) {} - - explicit FP32Vec8(const FP32Vec8 &data) : reg(data.reg) {} - -#ifdef __AVX512FP16__ - explicit FP32Vec8(__m128h v) : reg(_mm256_cvtph_ps(_mm_castph_si128(v))) {} -#endif - - explicit FP32Vec8(const BF16Vec8 &v) - : reg(_mm256_castsi256_ps( - _mm256_bslli_epi128(_mm256_cvtepu16_epi32(v.reg), 2))) {} - - float reduce_sum() const { - AliasReg ar; - ar.reg = reg; - float result = 0; - unroll_loop([&result, &ar](int i) { result += ar.values[i]; }); - - return result; - } - - FP32Vec8 exp() const { - AliasReg ar; - ar.reg = reg; - return FP32Vec8(_mm256_set_ps(expf(ar.values[7]), expf(ar.values[6]), - expf(ar.values[5]), expf(ar.values[4]), - expf(ar.values[3]), expf(ar.values[2]), - expf(ar.values[1]), expf(ar.values[0]))); - } - - FP32Vec8 tanh() const { - AliasReg ar; - ar.reg = reg; - return FP32Vec8(_mm256_set_ps(tanhf(ar.values[7]), tanhf(ar.values[6]), - tanhf(ar.values[5]), tanhf(ar.values[4]), - tanhf(ar.values[3]), tanhf(ar.values[2]), - tanhf(ar.values[1]), tanhf(ar.values[0]))); - } - - FP32Vec8 er() const { - AliasReg ar; - ar.reg = reg; - return FP32Vec8(_mm256_set_ps(erf(ar.values[7]), erf(ar.values[6]), - erf(ar.values[5]), erf(ar.values[4]), - erf(ar.values[3]), erf(ar.values[2]), - erf(ar.values[1]), erf(ar.values[0]))); - } - - FP32Vec8 operator*(const FP32Vec8 &b) const { - return FP32Vec8(_mm256_mul_ps(reg, b.reg)); - } - - FP32Vec8 operator+(const FP32Vec8 &b) const { - return FP32Vec8(_mm256_add_ps(reg, b.reg)); - } - - FP32Vec8 operator-(const FP32Vec8 &b) const { - return FP32Vec8(_mm256_sub_ps(reg, b.reg)); - } - - FP32Vec8 operator/(const FP32Vec8 &b) const { - return FP32Vec8(_mm256_div_ps(reg, b.reg)); - } - - void save(float *ptr) const { _mm256_storeu_ps(ptr, reg); } -}; - -#ifdef __AVX512F__ -struct FP32Vec16 : public Vec { - constexpr static int VEC_ELEM_NUM = 16; - union AliasReg { - __m512 reg; - float values[VEC_ELEM_NUM]; - }; - - __m512 reg; - - explicit FP32Vec16(float v) : reg(_mm512_set1_ps(v)) {} - - explicit FP32Vec16() : reg(_mm512_set1_ps(0.0)) {} - - explicit FP32Vec16(const float *ptr) : reg(_mm512_loadu_ps(ptr)) {} - - explicit FP32Vec16(__m512 data) : reg(data) {} - - explicit FP32Vec16(const FP32Vec16 &data) : reg(data.reg) {} - - explicit FP32Vec16(const 
FP32Vec4 &data) - : reg((__m512)_mm512_inserti32x4( - _mm512_inserti32x4( - _mm512_inserti32x4(_mm512_castsi128_si512((__m128i)data.reg), - (__m128i)data.reg, 1), - (__m128i)data.reg, 2), - (__m128i)data.reg, 3)) {} - - explicit FP32Vec16(const FP32Vec8 &data) - : reg((__m512)_mm512_inserti32x8( - _mm512_castsi256_si512((__m256i)data.reg), (__m256i)data.reg, 1)) {} - - explicit FP32Vec16(const BF16Vec16 &v) - : reg(_mm512_castsi512_ps( - _mm512_bslli_epi128(_mm512_cvtepu16_epi32(v.reg), 2))) {} - - explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {} - - FP32Vec16 operator*(const FP32Vec16 &b) const { - return FP32Vec16(_mm512_mul_ps(reg, b.reg)); - } - - FP32Vec16 operator+(const FP32Vec16 &b) const { - return FP32Vec16(_mm512_add_ps(reg, b.reg)); - } - - FP32Vec16 operator-(const FP32Vec16 &b) const { - return FP32Vec16(_mm512_sub_ps(reg, b.reg)); - } - - FP32Vec16 operator/(const FP32Vec16 &b) const { - return FP32Vec16(_mm512_div_ps(reg, b.reg)); - } - - float reduce_sum() const { return _mm512_reduce_add_ps(reg); } - - template float reduce_sub_sum(int idx) { - static_assert(VEC_ELEM_NUM % group_size == 0); - constexpr uint32_t base_mask = (0xFFFF >> (16 - group_size)); - __mmask16 mask = _cvtu32_mask16(base_mask << (idx * group_size)); - return _mm512_mask_reduce_add_ps(mask, reg); - } - - void save(float *ptr) const { _mm512_storeu_ps(ptr, reg); } -}; +#if defined(__x86_64__) + //x86 implementation + #include "cpu_types_x86.hpp" +#elif defined(__POWER9_VECTOR__) + //ppc implementation + #include "cpu_types_vsx.hpp" #else -struct FP32Vec16 : public Vec { - constexpr static int VEC_ELEM_NUM = 16; - - union AliasReg { - __m256 reg; - float values[8]; - }; - - __m256 reg_low; - __m256 reg_high; - - explicit FP32Vec16(float v) : reg_low(_mm256_set1_ps(v)), - reg_high(_mm256_set1_ps(v)) {} - - explicit FP32Vec16() : reg_low(_mm256_set1_ps(0.0)), - reg_high(_mm256_set1_ps(0.0)) {} - - explicit FP32Vec16(const float *ptr) : reg_low(_mm256_loadu_ps(ptr)), - reg_high(_mm256_loadu_ps(ptr + 8)) {} - - explicit FP32Vec16(__m256 low, __m256 high) : reg_low(low), reg_high(high) {} - - explicit FP32Vec16(const FP32Vec16 &data) : reg_low(data.reg_low), - reg_high(data.reg_high) {} - - explicit FP32Vec16(const FP32Vec4 &data) - : reg_low((__m256)_mm256_inserti128_si256( - _mm256_castsi128_si256((__m128i)data.reg), - (__m128i)data.reg, 1)), - reg_high((__m256)_mm256_inserti128_si256( - _mm256_castsi128_si256((__m128i)data.reg), - (__m128i)data.reg, 1)) {} - - explicit FP32Vec16(const FP32Vec8 &data) - : reg_low(data.reg), reg_high(data.reg) {} - - explicit FP32Vec16(const BF16Vec16 &v) { - __m128i low = _mm256_extractf128_si256(v.reg, 0); - __m128i high = _mm256_extractf128_si256(v.reg, 1); - - __m256i v_low_epi32 = _mm256_cvtepu16_epi32(low); - __m256i v_high_epi32 = _mm256_cvtepu16_epi32(high); - - __m256i v_low_shifted = _mm256_bslli_epi128(v_low_epi32, 2); - __m256i v_high_shifted = _mm256_bslli_epi128(v_high_epi32, 2); - - reg_low = _mm256_castsi256_ps(v_low_shifted); - reg_high = _mm256_castsi256_ps(v_high_shifted); - } - - explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {} - - FP32Vec16 operator*(const FP32Vec16 &b) const { - return FP32Vec16(_mm256_mul_ps(reg_low, b.reg_low), - _mm256_mul_ps(reg_high, b.reg_high)); - } - - FP32Vec16 operator+(const FP32Vec16 &b) const { - return FP32Vec16(_mm256_add_ps(reg_low, b.reg_low), - _mm256_add_ps(reg_high, b.reg_high)); - } - - FP32Vec16 operator-(const FP32Vec16 &b) const { - return FP32Vec16(_mm256_sub_ps(reg_low, 
b.reg_low), - _mm256_sub_ps(reg_high, b.reg_high)); - } - - FP32Vec16 operator/(const FP32Vec16 &b) const { - return FP32Vec16(_mm256_div_ps(reg_low, b.reg_low), - _mm256_div_ps(reg_high, b.reg_high)); - } - - float reduce_sum() const { - FP32Vec8 low = FP32Vec8(reg_low); - FP32Vec8 high = FP32Vec8(reg_high); - return low.reduce_sum() + high.reduce_sum(); - } - - template float reduce_sub_sum(int idx) { - float sum = 0.0; - static_assert(VEC_ELEM_NUM % group_size == 0); - constexpr uint32_t base_mask = (0xFFFF >> (16 - group_size)); - uint32_t mask = base_mask << (idx * group_size); - - AliasReg ar; - - auto func = [&sum, &mask, &ar](int i) { - int flag = mask & 0x1; - mask = mask >> 1; - if (flag != 0) sum += ar.values[i]; - }; - - ar.reg = reg_low; - unroll_loop(func); - - ar.reg = reg_high; - unroll_loop(func); - - return sum; - } - - void save(float *ptr) const { - _mm256_storeu_ps(ptr, reg_low); - _mm256_storeu_ps(ptr + 8, reg_high); - } -}; -#endif - -template struct VecType { using vec_type = void; }; - -template using vec_t = typename VecType::vec_type; - -template <> struct VecType { using vec_type = FP32Vec8; }; - -#ifdef __AVX512FP16__ -template <> struct VecType { using vec_type = FP16Vec16; }; + #warning "unsupported vLLM cpu implementation" #endif -template <> struct VecType { using vec_type = BF16Vec8; }; - -template void storeFP32(float v, T *ptr) { *ptr = v; } - -#ifdef __AVX512FP16__ -template <> inline void storeFP32(float v, c10::Half *ptr) { - *reinterpret_cast<_Float16 *>(ptr) = v; -} -#endif - -inline void fma(FP32Vec16 &acc, FP32Vec16 &a, FP32Vec16 &b) { - acc = acc + a * b; -} - -#ifdef __AVX512BF16__ -template <> inline void storeFP32(float v, c10::BFloat16 *ptr) { - *reinterpret_cast<__bfloat16 *>(ptr) = _mm_cvtness_sbh(v); -} - -inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) - : reg((__m128i)_mm256_cvtneps_pbh(v.reg)) {} - -inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) - : reg((__m256i)_mm512_cvtneps_pbh(v.reg)) {} - -inline void fma(FP32Vec16 &acc, BF16Vec32 &a, BF16Vec32 &b) { - acc.reg = _mm512_dpbf16_ps(acc.reg, (__m512bh)a.reg, (__m512bh)b.reg); -} -#else -template <> inline void storeFP32(float v, c10::BFloat16 *ptr) { - c10::BFloat16 __attribute__((__may_alias__)) *v_ptr = - reinterpret_cast(&v); - *ptr = *(v_ptr + 1); -} - -#ifdef __AVX512F__ -inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) - : reg(_mm256_cvtepi32_epi16( - _mm256_bsrli_epi128(_mm256_castps_si256(v.reg), 2))) {} - -inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) - : reg(_mm512_cvtepi32_epi16( - _mm512_bsrli_epi128(_mm512_castps_si512(v.reg), 2))) {} -#else -namespace{ -__m128i FP32Vec8_to_BF16Vec8_avx2(__m256 a) { - __m256i ai = _mm256_castps_si256(a); - ai = _mm256_srli_epi32(ai, 16); - ai = _mm256_packus_epi32(ai, ai); - ai = _mm256_permute4x64_epi64(ai, 0b00111001); - return _mm256_extracti128_si256(ai, 0); -} -} - -inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) - : reg(FP32Vec8_to_BF16Vec8_avx2(v.reg)) {} - -inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) { - BF16Vec8 low = BF16Vec8(FP32Vec8(v.reg_low)); - BF16Vec8 high = BF16Vec8(FP32Vec8(v.reg_high)); - reg = _mm256_insertf128_si256(_mm256_castsi128_si256(low.reg), high.reg, 1); -} -#endif // __AVX512F__ -#endif // __AVX512BF16__ - -inline void prefetch(const void *addr) { _mm_prefetch(addr, _MM_HINT_T1); } - -}; // namespace vec_op - #endif diff --git a/csrc/cpu/cpu_types_vsx.hpp b/csrc/cpu/cpu_types_vsx.hpp new file mode 100644 index 0000000000000..b50bdadc5713d --- /dev/null +++ b/csrc/cpu/cpu_types_vsx.hpp @@ -0,0 +1,491 @@ + 
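// Illustrative sketch of how a kernel consumes the split headers; the helper
// name scale_and_sum is hypothetical. Kernels keep including cpu_types.hpp and
// code against the shared vec_op interface, while the dispatch above selects
// the x86 (AVX2/AVX512) or POWER9 VSX backend at compile time.
#include "cpu_types.hpp"

float scale_and_sum(const float *x, float s, int n) {
  // Assumes n is a multiple of the vector width (8 floats).
  vec_op::FP32Vec8 scale(s);
  vec_op::FP32Vec8 acc(0.0f);
  for (int i = 0; i < n; i += vec_op::FP32Vec8::VEC_ELEM_NUM) {
    vec_op::FP32Vec8 v(x + i);  // unaligned load of 8 floats
    acc = acc + v * scale;      // element-wise multiply, then accumulate
  }
  return acc.reduce_sum();      // horizontal sum of the accumulator
}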
+#ifndef CPU_TYPES_VSX_HPP +#define CPU_TYPES_VSX_HPP + +#include +#include +#include + +namespace vec_op { + +// FIXME: FP16 is not fully supported in Torch-CPU +#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ + AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) + +#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \ + AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)) + +#ifndef CPU_OP_GUARD +#define CPU_KERNEL_GUARD_IN(NAME) +#define CPU_KERNEL_GUARD_OUT(NAME) +#else +#define CPU_KERNEL_GUARD_IN(NAME) \ + std::cout << #NAME << " invoked." << std::endl; +#define CPU_KERNEL_GUARD_OUT(NAME) std::cout << #NAME << " exit." << std::endl; +#endif + +#define FORCE_INLINE __attribute__((always_inline)) inline + +namespace { +template +constexpr void unroll_loop_item(std::integer_sequence, F &&f) { + (f(std::integral_constant{}), ...); +} +}; // namespace + +template >> +constexpr void unroll_loop(F &&f) { + unroll_loop_item(std::make_integer_sequence{}, std::forward(f)); +} + +template struct Vec { + constexpr static int get_elem_num() { return T::VEC_ELEM_NUM; } +}; + +typedef struct ss16x8x2_t { + __vector signed short val[2]; +} ss16x8x2_t; + +typedef struct ss16x8x4_t { + __vector signed short val[4]; +} ss16x8x4_t; + +typedef struct f32x4x2_t { + __vector float val[2]; +} f32x4x2_t; + +typedef struct f32x4x4_t { + __vector float val[4]; +} f32x4x4_t; + +struct FP32Vec8; +struct FP32Vec16; + +struct BF16Vec8 : public Vec { + constexpr static int VEC_ELEM_NUM = 8; + + __vector signed short reg; + + explicit BF16Vec8(const void *ptr) + : reg((__vector signed short)vec_xl(0, (__vector signed short *)ptr)) {} + + explicit BF16Vec8(const FP32Vec8 &); + + void save(void *ptr) const { *reinterpret_cast<__vector signed short *>(ptr) = reg; } +}; + +struct BF16Vec16 : public Vec { + constexpr static int VEC_ELEM_NUM = 16; + + ss16x8x2_t reg; + + explicit BF16Vec16(const void *ptr) { + // Load 256 bits in two parts + reg.val[0] = (__vector signed short)vec_xl(0, (signed short *)ptr); + reg.val[1] = (__vector signed short)vec_xl(16, (signed short *)ptr); + } + + explicit BF16Vec16(const FP32Vec16 &); + + void save(void *ptr) const { + // Save 256 bits in two parts + vec_xst(reg.val[0], 0, (signed short *)ptr); + vec_xst(reg.val[1], 16, (signed short *)ptr); + } +}; + +const static __vector signed short zero = vec_splats((signed short)0); + +struct BF16Vec32 : public Vec { + constexpr static int VEC_ELEM_NUM = 32; + + ss16x8x4_t reg; + explicit BF16Vec32(const void *ptr) + : reg(*reinterpret_cast(ptr)) {} + + explicit BF16Vec32(ss16x8x4_t data) : reg(data) {} + + explicit BF16Vec32(const BF16Vec8 &vec8_data) : reg({ + vec8_data.reg, + vec8_data.reg, + vec8_data.reg, + vec8_data.reg + }) {} + + void save(void *ptr) const { *reinterpret_cast(ptr) = reg; } +}; + +struct FP32Vec4 : public Vec { + constexpr static int VEC_ELEM_NUM = 4; + union AliasReg { + __vector float reg; + float values[VEC_ELEM_NUM]; + }; + + __vector float reg; + + explicit FP32Vec4(float v) : reg(vec_splats(v)) {} + + explicit FP32Vec4() : reg(vec_splats(0.0f)) {} + + explicit FP32Vec4(const float *ptr) : reg(vec_xl(0, ptr)) {} + + explicit FP32Vec4(__vector float data) : reg(data) {} + + explicit FP32Vec4(const FP32Vec4 &data) : reg(data.reg) {} +}; + +struct FP32Vec8 : public Vec { + constexpr static int VEC_ELEM_NUM = 8; + union AliasReg { + f32x4x2_t reg; + float values[VEC_ELEM_NUM]; + }; + + f32x4x2_t reg; + + explicit FP32Vec8(float v) 
{ + reg.val[0] = vec_splats(v); + reg.val[1] = vec_splats(v); + } + + explicit FP32Vec8() { + reg.val[0] = vec_splats(0.0f); + reg.val[1] = vec_splats(0.0f); + } + + explicit FP32Vec8(const float *ptr) { + reg.val[0] = vec_xl(0, ptr); + reg.val[1] = vec_xl(16, ptr); + } + + explicit FP32Vec8(f32x4x2_t data) : reg(data) {} + + explicit FP32Vec8(const FP32Vec8 &data) { + reg.val[0] = data.reg.val[0]; + reg.val[1] = data.reg.val[1]; + } + + explicit FP32Vec8(const BF16Vec8 &v) { + reg.val[0] = (__vector float)vec_mergeh(zero, v.reg); + reg.val[1] = (__vector float)vec_mergel(zero, v.reg); + } + + float reduce_sum() const { + AliasReg ar; + ar.reg = reg; + float result = 0; + unroll_loop([&result, &ar](int i) { result += ar.values[i]; }); + + return result; + } + + FP32Vec8 exp() const { + // TODO: Vectorize this + AliasReg ar; + ar.reg = reg; + f32x4x4_t ret; + ret.val[0][0] = std::exp(ar.values[0]); + ret.val[0][1] = std::exp(ar.values[1]); + ret.val[0][2] = std::exp(ar.values[2]); + ret.val[0][3] = std::exp(ar.values[3]); + ret.val[1][0] = std::exp(ar.values[4]); + ret.val[1][1] = std::exp(ar.values[5]); + ret.val[1][2] = std::exp(ar.values[6]); + ret.val[1][3] = std::exp(ar.values[7]); + return FP32Vec8(f32x4x2_t({ret.val[0], ret.val[1]})); + } + + FP32Vec8 tanh() const { + // TODO: Vectorize this + AliasReg ar; + ar.reg = reg; + f32x4x4_t ret; + ret.val[0][0] = std::tanh(ar.values[0]); + ret.val[0][1] = std::tanh(ar.values[1]); + ret.val[0][2] = std::tanh(ar.values[2]); + ret.val[0][3] = std::tanh(ar.values[3]); + ret.val[1][0] = std::tanh(ar.values[4]); + ret.val[1][1] = std::tanh(ar.values[5]); + ret.val[1][2] = std::tanh(ar.values[6]); + ret.val[1][3] = std::tanh(ar.values[7]); + return FP32Vec8(f32x4x2_t({ret.val[0], ret.val[1]})); + } + + FP32Vec8 er() const { + // TODO: Vectorize this + AliasReg ar; + ar.reg = reg; + f32x4x4_t ret; + ret.val[0][0] = std::erf(ar.values[0]); + ret.val[0][1] = std::erf(ar.values[1]); + ret.val[0][2] = std::erf(ar.values[2]); + ret.val[0][3] = std::erf(ar.values[3]); + ret.val[1][0] = std::erf(ar.values[4]); + ret.val[1][1] = std::erf(ar.values[5]); + ret.val[1][2] = std::erf(ar.values[6]); + ret.val[1][3] = std::erf(ar.values[7]); + return FP32Vec8(f32x4x2_t({ret.val[0], ret.val[1]})); + } + + FP32Vec8 operator*(const FP32Vec8 &b) const { + return FP32Vec8({vec_mul(reg.val[0], b.reg.val[0]), vec_mul(reg.val[1], b.reg.val[1])}); + } + + FP32Vec8 operator+(const FP32Vec8 &b) const { + return FP32Vec8({vec_add(reg.val[0], b.reg.val[0]), vec_add(reg.val[1], b.reg.val[1])}); + } + + FP32Vec8 operator-(const FP32Vec8 &b) const { + return FP32Vec8({vec_sub(reg.val[0], b.reg.val[0]), vec_sub(reg.val[1], b.reg.val[1])}); + } + + FP32Vec8 operator/(const FP32Vec8 &b) const { + return FP32Vec8({vec_div(reg.val[0], b.reg.val[0]), vec_div(reg.val[1], b.reg.val[1])}); + } + + void save(float *ptr) const { + vec_xst(reg.val[0], 0, ptr); + vec_xst(reg.val[1], 16, ptr); + } +}; + +struct FP32Vec16 : public Vec { + constexpr static int VEC_ELEM_NUM = 16; + union AliasReg { + f32x4x4_t reg; + float values[VEC_ELEM_NUM]; + }; + + f32x4x4_t reg; + + explicit FP32Vec16(float v) { + reg.val[0] = vec_splats(v); + reg.val[1] = vec_splats(v); + reg.val[2] = vec_splats(v); + reg.val[3] = vec_splats(v); + } + + explicit FP32Vec16() { + reg.val[0] = vec_splats(0.0f); + reg.val[1] = vec_splats(0.0f); + reg.val[2] = vec_splats(0.0f); + reg.val[3] = vec_splats(0.0f); + } + + explicit FP32Vec16(const float *ptr) { + reg.val[0] = vec_xl(0, ptr); + reg.val[1] = vec_xl(16, ptr); + 
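// Aside: a scalar model of the BF16 -> FP32 widening that vec_mergeh /
// vec_mergel perform above, eight lanes at a time. A bfloat16 value is the
// upper 16 bits of the corresponding float32, so widening just places the bit
// pattern in the high half of a 32-bit word; the merge with the `zero` vector
// supplies the zeroed low 16 bits. Illustrative sketch only; the helper name
// bf16_bits_to_float is hypothetical.
#include <cstdint>
#include <cstring>

static inline float bf16_bits_to_float(uint16_t bits) {
  uint32_t widened = static_cast<uint32_t>(bits) << 16;  // low 16 bits are zero
  float out;
  std::memcpy(&out, &widened, sizeof(out));  // bit-exact copy, no strict-aliasing UB
  return out;
}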
reg.val[2] = vec_xl(32, ptr); + reg.val[3] = vec_xl(48, ptr); + } + + explicit FP32Vec16(f32x4x4_t data) : reg(data) {} + + explicit FP32Vec16(const FP32Vec16 &data) { + reg.val[0] = data.reg.val[0]; + reg.val[1] = data.reg.val[1]; + reg.val[2] = data.reg.val[2]; + reg.val[3] = data.reg.val[3]; + } + + explicit FP32Vec16(const FP32Vec4 &data) { + reg.val[0] = data.reg; + reg.val[1] = data.reg; + reg.val[2] = data.reg; + reg.val[3] = data.reg; + } + + explicit FP32Vec16(const FP32Vec8 &data) { + reg.val[0] = data.reg.val[0]; + reg.val[1] = data.reg.val[1]; + reg.val[2] = data.reg.val[0]; + reg.val[3] = data.reg.val[1]; + } + + explicit FP32Vec16(const BF16Vec16 &v) { + reg.val[0] = (__vector float)vec_mergeh(zero, v.reg.val[0]); + reg.val[1] = (__vector float)vec_mergel(zero, v.reg.val[0]); + reg.val[2] = (__vector float)vec_mergeh(zero, v.reg.val[1]); + reg.val[3] = (__vector float)vec_mergel(zero, v.reg.val[1]); + } + + explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {} + + FP32Vec16 operator*(const FP32Vec16 &b) const { + return FP32Vec16(f32x4x4_t({ + vec_mul(reg.val[0], b.reg.val[0]), + vec_mul(reg.val[1], b.reg.val[1]), + vec_mul(reg.val[2], b.reg.val[2]), + vec_mul(reg.val[3], b.reg.val[3])})); + } + + FP32Vec16 operator+(const FP32Vec16 &b) const { + return FP32Vec16(f32x4x4_t({ + vec_add(reg.val[0], b.reg.val[0]), + vec_add(reg.val[1], b.reg.val[1]), + vec_add(reg.val[2], b.reg.val[2]), + vec_add(reg.val[3], b.reg.val[3])})); + } + + FP32Vec16 operator-(const FP32Vec16 &b) const { + return FP32Vec16(f32x4x4_t({ + vec_sub(reg.val[0], b.reg.val[0]), + vec_sub(reg.val[1], b.reg.val[1]), + vec_sub(reg.val[2], b.reg.val[2]), + vec_sub(reg.val[3], b.reg.val[3])})); + } + + FP32Vec16 operator/(const FP32Vec16 &b) const { + return FP32Vec16(f32x4x4_t({ + vec_div(reg.val[0], b.reg.val[0]), + vec_div(reg.val[1], b.reg.val[1]), + vec_div(reg.val[2], b.reg.val[2]), + vec_div(reg.val[3], b.reg.val[3])})); + } + + float reduce_sum() const { + AliasReg ar; + ar.reg = reg; + float result = 0; + unroll_loop([&result, &ar](int i) { result += ar.values[i]; }); + + return result; + } + + template float reduce_sub_sum(int idx) { + static_assert(VEC_ELEM_NUM % group_size == 0); + + AliasReg ar; + ar.reg = reg; + float result = 0; + const int start = idx * group_size; + unroll_loop( + [&result, &start, ar](int i) { result += ar.values[start + i]; }); + + return result; + } + + void save(float *ptr) const { + vec_xst(reg.val[0], 0, ptr); + vec_xst(reg.val[1], 16, ptr); + vec_xst(reg.val[2], 32, ptr); + vec_xst(reg.val[3], 48, ptr); + } +}; + +template struct VecType { using vec_type = void; }; + +template using vec_t = typename VecType::vec_type; + +template <> struct VecType { using vec_type = FP32Vec8; }; + +template <> struct VecType { using vec_type = BF16Vec8; }; + +template void storeFP32(float v, T *ptr) { *ptr = v; } + +inline void fma(FP32Vec16 &acc, FP32Vec16 &a, FP32Vec16 &b) { + acc = acc + a * b; +} + +template <> inline void storeFP32(float v, c10::BFloat16 *ptr) { + c10::BFloat16 __attribute__((__may_alias__)) *v_ptr = + reinterpret_cast(&v); + *ptr = *(v_ptr + 1); +} + +#ifndef __VEC_CLASS_FP_NAN +#define __VEC_CLASS_FP_NAN (1 << 6) +#endif + +const static __vector unsigned char omask = { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 }; +#ifndef _ARCH_PWR10 +const static __vector unsigned int bias = { 0x00007fff, 0x00007fff, 0x00007fff, 0x00007fff }; +const static __vector unsigned int nan = { 0x7fc00000, 0x7fc00000, 0x7fc00000, 0x7fc00000 }; +const static 
__vector unsigned int sh16 = { 16, 16, 16, 16 }; +const static __vector unsigned int one = { 1, 1, 1, 1 }; +#endif + +inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) { +#ifdef _ARCH_PWR10 + __vector signed short ret[2]; + ret[0] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[0]); + ret[1] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[1]); + reg = vec_perm(ret[0], ret[1], omask); +#elif defined(_ARCH_PWR9) + __vector unsigned int inp0 = (__vector unsigned int)(v.reg.val[0]); + __vector unsigned int inp1 = (__vector unsigned int)(v.reg.val[1]); + __vector unsigned int lsb0 = vec_sr(inp0, sh16); + __vector unsigned int lsb1 = vec_sr(inp1, sh16); + lsb0 = vec_and(lsb0, one); + lsb1 = vec_and(lsb1, one); + __vector unsigned int rnd0 = vec_add(lsb0, bias); + __vector unsigned int rnd1 = vec_add(lsb1, bias); + inp0 = vec_add(inp0, rnd0); + inp1 = vec_add(inp1, rnd1); + __vector __bool int sel0 = vec_test_data_class(v.reg.val[0], __VEC_CLASS_FP_NAN); + __vector __bool int sel1 = vec_test_data_class(v.reg.val[1], __VEC_CLASS_FP_NAN); + inp0 = vec_sel(inp0, nan, sel0); + inp1 = vec_sel(inp1, nan, sel1); + inp0 = vec_sr(inp0, sh16); + inp1 = vec_sr(inp1, sh16); + reg = (__vector signed short)vec_perm(inp0, inp1, omask); +#endif +} + +inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) { +#ifdef _ARCH_PWR10 + __vector signed short ret[4]; + ret[0] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[0]); + ret[1] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[1]); + ret[2] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[2]); + ret[3] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[3]); + reg.val[0] = vec_perm(ret[0], ret[1], omask); + reg.val[1] = vec_perm(ret[2], ret[3], omask); +#elif defined(_ARCH_PWR9) + __vector unsigned int inp0 = (__vector unsigned int)(v.reg.val[0]); + __vector unsigned int inp1 = (__vector unsigned int)(v.reg.val[1]); + __vector unsigned int inp2 = (__vector unsigned int)(v.reg.val[2]); + __vector unsigned int inp3 = (__vector unsigned int)(v.reg.val[3]); + __vector unsigned int lsb0 = vec_sr(inp0, sh16); + __vector unsigned int lsb1 = vec_sr(inp1, sh16); + __vector unsigned int lsb2 = vec_sr(inp2, sh16); + __vector unsigned int lsb3 = vec_sr(inp3, sh16); + lsb0 = vec_and(lsb0, one); + lsb1 = vec_and(lsb1, one); + lsb2 = vec_and(lsb2, one); + lsb3 = vec_and(lsb3, one); + __vector unsigned int rnd0 = vec_add(lsb0, bias); + __vector unsigned int rnd1 = vec_add(lsb1, bias); + __vector unsigned int rnd2 = vec_add(lsb2, bias); + __vector unsigned int rnd3 = vec_add(lsb3, bias); + inp0 = vec_add(inp0, rnd0); + inp1 = vec_add(inp1, rnd1); + inp2 = vec_add(inp2, rnd2); + inp3 = vec_add(inp3, rnd3); + __vector __bool int sel0 = vec_test_data_class(v.reg.val[0], __VEC_CLASS_FP_NAN); + __vector __bool int sel1 = vec_test_data_class(v.reg.val[1], __VEC_CLASS_FP_NAN); + __vector __bool int sel2 = vec_test_data_class(v.reg.val[2], __VEC_CLASS_FP_NAN); + __vector __bool int sel3 = vec_test_data_class(v.reg.val[3], __VEC_CLASS_FP_NAN); + inp0 = vec_sel(inp0, nan, sel0); + inp1 = vec_sel(inp1, nan, sel1); + inp2 = vec_sel(inp2, nan, sel2); + inp3 = vec_sel(inp3, nan, sel3); + inp0 = vec_sr(inp0, sh16); + inp1 = vec_sr(inp1, sh16); + inp2 = vec_sr(inp2, sh16); + inp3 = vec_sr(inp3, sh16); + reg.val[0] = (__vector signed short)vec_perm(inp0, inp1, omask); + reg.val[1] = (__vector signed 
short)vec_perm(inp2, inp3, omask); +#endif +} + +inline void prefetch(const void *addr) { + __asm__ __volatile__("dcbt 0, %0" : : "r"(addr) : "memory"); +} + +}; // namespace vec_op + +#endif diff --git a/csrc/cpu/cpu_types_x86.hpp b/csrc/cpu/cpu_types_x86.hpp new file mode 100644 index 0000000000000..f50620a5287d4 --- /dev/null +++ b/csrc/cpu/cpu_types_x86.hpp @@ -0,0 +1,515 @@ + +#ifndef CPU_TYPES_X86_HPP +#define CPU_TYPES_X86_HPP + +#include +#include + +#ifndef __AVX2__ +static_assert(false, "AVX2 must be supported for the current implementation."); +#endif + +namespace vec_op { + +// FIXME: FP16 is not fully supported in Torch-CPU +#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ + AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) + +#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \ + AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)) + +#ifndef CPU_OP_GUARD +#define CPU_KERNEL_GUARD_IN(NAME) +#define CPU_KERNEL_GUARD_OUT(NAME) +#else +#define CPU_KERNEL_GUARD_IN(NAME) \ + std::cout << #NAME << " invoked." << std::endl; +#define CPU_KERNEL_GUARD_OUT(NAME) std::cout << #NAME << " exit." << std::endl; +#endif + +#define FORCE_INLINE __attribute__((always_inline)) inline + +namespace { +template +constexpr void unroll_loop_item(std::integer_sequence, F &&f) { + (f(std::integral_constant{}), ...); +} +}; // namespace + +template >> +constexpr void unroll_loop(F &&f) { + unroll_loop_item(std::make_integer_sequence{}, std::forward(f)); +} + +template struct Vec { + constexpr static int get_elem_num() { return T::VEC_ELEM_NUM; } +}; + +struct FP32Vec8; +struct FP32Vec16; + +#ifdef __AVX512FP16__ +struct FP16Vec8 : public Vec { + constexpr static int VEC_ELEM_NUM = 8; + + __m128h reg; + + explicit FP16Vec8(_Float16 v) : reg(_mm_set1_ph(v)) {} + + explicit FP16Vec8(const void *ptr) : reg(_mm_loadu_ph(ptr)) {} + + explicit FP16Vec8(__m128h data) : reg(data) {} + + FP16Vec8 operator*(const FP16Vec8 &b) const { + return FP16Vec8(_mm_mul_ph(reg, b.reg)); + } + + FP16Vec8 operator+(const FP16Vec8 &b) const { + return FP16Vec8(_mm_add_ph(reg, b.reg)); + } + + FP16Vec8 operator-(const FP16Vec8 &b) const { + return FP16Vec8(_mm_sub_ph(reg, b.reg)); + } + + FP16Vec8 operator/(const FP16Vec8 &b) const { + return FP16Vec8(_mm_div_ph(reg, b.reg)); + } + + void save(void *ptr) const { _mm_storeu_ph(ptr, reg); } +}; +#endif + +struct BF16Vec8 : public Vec { + constexpr static int VEC_ELEM_NUM = 8; + + __m128i reg; + + explicit BF16Vec8(const void *ptr) + : reg((__m128i)_mm_loadu_si128((__m128i *)ptr)) {} + + explicit BF16Vec8(const FP32Vec8 &); + + void save(void *ptr) const { *reinterpret_cast<__m128i *>(ptr) = reg; } +}; + +struct BF16Vec16 : public Vec { + constexpr static int VEC_ELEM_NUM = 16; + + __m256i reg; + + explicit BF16Vec16(const void *ptr) + : reg((__m256i)_mm256_loadu_si256((__m256i *)ptr)) {} + + explicit BF16Vec16(const FP32Vec16 &); + + void save(void *ptr) const { *reinterpret_cast<__m256i *>(ptr) = reg; } +}; + +#ifdef __AVX512F__ +struct BF16Vec32 : public Vec { + constexpr static int VEC_ELEM_NUM = 32; + + __m512i reg; + + explicit BF16Vec32(const void *ptr) : reg((__m512i)_mm512_loadu_si512(ptr)) {} + + explicit BF16Vec32(__m512i data) : reg(data) {} + + explicit BF16Vec32(BF16Vec8 &vec8_data) + : reg((__m512i)_mm512_inserti32x4( + _mm512_inserti32x4(_mm512_inserti32x4(_mm512_castsi128_si512( + (__m128i)vec8_data.reg), + (__m128i)vec8_data.reg, 1), + (__m128i)vec8_data.reg, 2), + 
(__m128i)vec8_data.reg, 3)) {} + + void save(void *ptr) const { *reinterpret_cast<__m512i *>(ptr) = reg; } +}; +#else +struct BF16Vec32 : public Vec { + constexpr static int VEC_ELEM_NUM = 32; + + __m256i reg_low; + __m256i reg_high; + + explicit BF16Vec32(const void *ptr) + : reg_low(_mm256_loadu_si256((__m256i const *)ptr)), + reg_high(_mm256_loadu_si256((__m256i const *)ptr + 1)) {} + + explicit BF16Vec32(__m256i low, __m256i high) : reg_low(low), + reg_high(high) {} + + explicit BF16Vec32(BF16Vec8 &vec8_data) + : reg_low((__m256i)_mm256_inserti32x4( + _mm256_castsi128_si256((__m128i)vec8_data.reg), + (__m128i)vec8_data.reg, 1)), + reg_high((__m256i)_mm256_inserti32x4( + _mm256_castsi128_si256((__m128i)vec8_data.reg), + (__m128i)vec8_data.reg, 1)) {} + + void save(void *ptr) const { + *reinterpret_cast<__m256i *>(ptr) = reg_low; + *reinterpret_cast<__m256i *>((__m256i *)ptr + 1) = reg_high; + } +}; +#endif + +struct FP32Vec4 : public Vec { + constexpr static int VEC_ELEM_NUM = 4; + union AliasReg { + __m128 reg; + float values[VEC_ELEM_NUM]; + }; + + __m128 reg; + + explicit FP32Vec4(float v) : reg(_mm_set1_ps(v)) {} + + explicit FP32Vec4() : reg(_mm_set1_ps(0.0)) {} + + explicit FP32Vec4(const float *ptr) : reg(_mm_loadu_ps(ptr)) {} + + explicit FP32Vec4(__m128 data) : reg(data) {} + + explicit FP32Vec4(const FP32Vec4 &data) : reg(data.reg) {} +}; + +struct FP32Vec8 : public Vec { + constexpr static int VEC_ELEM_NUM = 8; + union AliasReg { + __m256 reg; + float values[VEC_ELEM_NUM]; + }; + + __m256 reg; + + explicit FP32Vec8(float v) : reg(_mm256_set1_ps(v)) {} + + explicit FP32Vec8() : reg(_mm256_set1_ps(0.0)) {} + + explicit FP32Vec8(const float *ptr) : reg(_mm256_loadu_ps(ptr)) {} + + explicit FP32Vec8(__m256 data) : reg(data) {} + + explicit FP32Vec8(const FP32Vec8 &data) : reg(data.reg) {} + +#ifdef __AVX512FP16__ + explicit FP32Vec8(__m128h v) : reg(_mm256_cvtph_ps(_mm_castph_si128(v))) {} +#endif + + explicit FP32Vec8(const BF16Vec8 &v) + : reg(_mm256_castsi256_ps( + _mm256_bslli_epi128(_mm256_cvtepu16_epi32(v.reg), 2))) {} + + float reduce_sum() const { + AliasReg ar; + ar.reg = reg; + float result = 0; + unroll_loop([&result, &ar](int i) { result += ar.values[i]; }); + + return result; + } + + FP32Vec8 exp() const { + AliasReg ar; + ar.reg = reg; + return FP32Vec8(_mm256_set_ps(expf(ar.values[7]), expf(ar.values[6]), + expf(ar.values[5]), expf(ar.values[4]), + expf(ar.values[3]), expf(ar.values[2]), + expf(ar.values[1]), expf(ar.values[0]))); + } + + FP32Vec8 tanh() const { + AliasReg ar; + ar.reg = reg; + return FP32Vec8(_mm256_set_ps(tanhf(ar.values[7]), tanhf(ar.values[6]), + tanhf(ar.values[5]), tanhf(ar.values[4]), + tanhf(ar.values[3]), tanhf(ar.values[2]), + tanhf(ar.values[1]), tanhf(ar.values[0]))); + } + + FP32Vec8 er() const { + AliasReg ar; + ar.reg = reg; + return FP32Vec8(_mm256_set_ps(erf(ar.values[7]), erf(ar.values[6]), + erf(ar.values[5]), erf(ar.values[4]), + erf(ar.values[3]), erf(ar.values[2]), + erf(ar.values[1]), erf(ar.values[0]))); + } + + FP32Vec8 operator*(const FP32Vec8 &b) const { + return FP32Vec8(_mm256_mul_ps(reg, b.reg)); + } + + FP32Vec8 operator+(const FP32Vec8 &b) const { + return FP32Vec8(_mm256_add_ps(reg, b.reg)); + } + + FP32Vec8 operator-(const FP32Vec8 &b) const { + return FP32Vec8(_mm256_sub_ps(reg, b.reg)); + } + + FP32Vec8 operator/(const FP32Vec8 &b) const { + return FP32Vec8(_mm256_div_ps(reg, b.reg)); + } + + void save(float *ptr) const { _mm256_storeu_ps(ptr, reg); } +}; + +#ifdef __AVX512F__ +struct FP32Vec16 : public Vec { + 
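// Aside: the AliasReg / unroll_loop idiom that reduce_sum() and the scalar
// math fallbacks above rely on, shown standalone. AliasReg overlays the SIMD
// register with a plain float array so per-lane scalar code can read it, and
// unroll_loop expands a fixed-trip loop at compile time via an integer
// sequence. Minimal sketch with hypothetical names (unroll, unroll_item,
// sum8); not the exact definitions used in this header.
#include <utility>

template <typename T, T... ids, typename F>
constexpr void unroll_item(std::integer_sequence<T, ids...>, F &&f) {
  (f(std::integral_constant<T, ids>{}), ...);  // C++17 fold: f(0), f(1), ...
}

template <typename T, T count, typename F>
constexpr void unroll(F &&f) {
  unroll_item(std::make_integer_sequence<T, count>{}, std::forward<F>(f));
}

float sum8(const float (&lanes)[8]) {
  float acc = 0.0f;
  unroll<int, 8>([&](int i) { acc += lanes[i]; });  // fully unrolled by the compiler
  return acc;
}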
constexpr static int VEC_ELEM_NUM = 16; + union AliasReg { + __m512 reg; + float values[VEC_ELEM_NUM]; + }; + + __m512 reg; + + explicit FP32Vec16(float v) : reg(_mm512_set1_ps(v)) {} + + explicit FP32Vec16() : reg(_mm512_set1_ps(0.0)) {} + + explicit FP32Vec16(const float *ptr) : reg(_mm512_loadu_ps(ptr)) {} + + explicit FP32Vec16(__m512 data) : reg(data) {} + + explicit FP32Vec16(const FP32Vec16 &data) : reg(data.reg) {} + + explicit FP32Vec16(const FP32Vec4 &data) + : reg((__m512)_mm512_inserti32x4( + _mm512_inserti32x4( + _mm512_inserti32x4(_mm512_castsi128_si512((__m128i)data.reg), + (__m128i)data.reg, 1), + (__m128i)data.reg, 2), + (__m128i)data.reg, 3)) {} + + explicit FP32Vec16(const FP32Vec8 &data) + : reg((__m512)_mm512_inserti32x8( + _mm512_castsi256_si512((__m256i)data.reg), (__m256i)data.reg, 1)) {} + + explicit FP32Vec16(const BF16Vec16 &v) + : reg(_mm512_castsi512_ps( + _mm512_bslli_epi128(_mm512_cvtepu16_epi32(v.reg), 2))) {} + + explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {} + + FP32Vec16 operator*(const FP32Vec16 &b) const { + return FP32Vec16(_mm512_mul_ps(reg, b.reg)); + } + + FP32Vec16 operator+(const FP32Vec16 &b) const { + return FP32Vec16(_mm512_add_ps(reg, b.reg)); + } + + FP32Vec16 operator-(const FP32Vec16 &b) const { + return FP32Vec16(_mm512_sub_ps(reg, b.reg)); + } + + FP32Vec16 operator/(const FP32Vec16 &b) const { + return FP32Vec16(_mm512_div_ps(reg, b.reg)); + } + + float reduce_sum() const { return _mm512_reduce_add_ps(reg); } + + template float reduce_sub_sum(int idx) { + static_assert(VEC_ELEM_NUM % group_size == 0); + constexpr uint32_t base_mask = (0xFFFF >> (16 - group_size)); + __mmask16 mask = _cvtu32_mask16(base_mask << (idx * group_size)); + return _mm512_mask_reduce_add_ps(mask, reg); + } + + void save(float *ptr) const { _mm512_storeu_ps(ptr, reg); } +}; +#else +struct FP32Vec16 : public Vec { + constexpr static int VEC_ELEM_NUM = 16; + + union AliasReg { + __m256 reg; + float values[8]; + }; + + __m256 reg_low; + __m256 reg_high; + + explicit FP32Vec16(float v) : reg_low(_mm256_set1_ps(v)), + reg_high(_mm256_set1_ps(v)) {} + + explicit FP32Vec16() : reg_low(_mm256_set1_ps(0.0)), + reg_high(_mm256_set1_ps(0.0)) {} + + explicit FP32Vec16(const float *ptr) : reg_low(_mm256_loadu_ps(ptr)), + reg_high(_mm256_loadu_ps(ptr + 8)) {} + + explicit FP32Vec16(__m256 low, __m256 high) : reg_low(low), reg_high(high) {} + + explicit FP32Vec16(const FP32Vec16 &data) : reg_low(data.reg_low), + reg_high(data.reg_high) {} + + explicit FP32Vec16(const FP32Vec4 &data) + : reg_low((__m256)_mm256_inserti128_si256( + _mm256_castsi128_si256((__m128i)data.reg), + (__m128i)data.reg, 1)), + reg_high((__m256)_mm256_inserti128_si256( + _mm256_castsi128_si256((__m128i)data.reg), + (__m128i)data.reg, 1)) {} + + explicit FP32Vec16(const FP32Vec8 &data) + : reg_low(data.reg), reg_high(data.reg) {} + + explicit FP32Vec16(const BF16Vec16 &v) { + __m128i low = _mm256_extractf128_si256(v.reg, 0); + __m128i high = _mm256_extractf128_si256(v.reg, 1); + + __m256i v_low_epi32 = _mm256_cvtepu16_epi32(low); + __m256i v_high_epi32 = _mm256_cvtepu16_epi32(high); + + __m256i v_low_shifted = _mm256_bslli_epi128(v_low_epi32, 2); + __m256i v_high_shifted = _mm256_bslli_epi128(v_high_epi32, 2); + + reg_low = _mm256_castsi256_ps(v_low_shifted); + reg_high = _mm256_castsi256_ps(v_high_shifted); + } + + explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {} + + FP32Vec16 operator*(const FP32Vec16 &b) const { + return FP32Vec16(_mm256_mul_ps(reg_low, b.reg_low), + 
_mm256_mul_ps(reg_high, b.reg_high)); + } + + FP32Vec16 operator+(const FP32Vec16 &b) const { + return FP32Vec16(_mm256_add_ps(reg_low, b.reg_low), + _mm256_add_ps(reg_high, b.reg_high)); + } + + FP32Vec16 operator-(const FP32Vec16 &b) const { + return FP32Vec16(_mm256_sub_ps(reg_low, b.reg_low), + _mm256_sub_ps(reg_high, b.reg_high)); + } + + FP32Vec16 operator/(const FP32Vec16 &b) const { + return FP32Vec16(_mm256_div_ps(reg_low, b.reg_low), + _mm256_div_ps(reg_high, b.reg_high)); + } + + float reduce_sum() const { + FP32Vec8 low = FP32Vec8(reg_low); + FP32Vec8 high = FP32Vec8(reg_high); + return low.reduce_sum() + high.reduce_sum(); + } + + template float reduce_sub_sum(int idx) { + float sum = 0.0; + static_assert(VEC_ELEM_NUM % group_size == 0); + constexpr uint32_t base_mask = (0xFFFF >> (16 - group_size)); + uint32_t mask = base_mask << (idx * group_size); + + AliasReg ar; + + auto func = [&sum, &mask, &ar](int i) { + int flag = mask & 0x1; + mask = mask >> 1; + if (flag != 0) sum += ar.values[i]; + }; + + ar.reg = reg_low; + unroll_loop(func); + + ar.reg = reg_high; + unroll_loop(func); + + return sum; + } + + void save(float *ptr) const { + _mm256_storeu_ps(ptr, reg_low); + _mm256_storeu_ps(ptr + 8, reg_high); + } +}; +#endif + +template struct VecType { using vec_type = void; }; + +template using vec_t = typename VecType::vec_type; + +template <> struct VecType { using vec_type = FP32Vec8; }; + +#ifdef __AVX512FP16__ +template <> struct VecType { using vec_type = FP16Vec16; }; +#endif + +template <> struct VecType { using vec_type = BF16Vec8; }; + +template void storeFP32(float v, T *ptr) { *ptr = v; } + +#ifdef __AVX512FP16__ +template <> inline void storeFP32(float v, c10::Half *ptr) { + *reinterpret_cast<_Float16 *>(ptr) = v; +} +#endif + +inline void fma(FP32Vec16 &acc, FP32Vec16 &a, FP32Vec16 &b) { + acc = acc + a * b; +} + +#ifdef __AVX512BF16__ +template <> inline void storeFP32(float v, c10::BFloat16 *ptr) { + *reinterpret_cast<__bfloat16 *>(ptr) = _mm_cvtness_sbh(v); +} + +inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) + : reg((__m128i)_mm256_cvtneps_pbh(v.reg)) {} + +inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) + : reg((__m256i)_mm512_cvtneps_pbh(v.reg)) {} + +inline void fma(FP32Vec16 &acc, BF16Vec32 &a, BF16Vec32 &b) { + acc.reg = _mm512_dpbf16_ps(acc.reg, (__m512bh)a.reg, (__m512bh)b.reg); +} +#else +template <> inline void storeFP32(float v, c10::BFloat16 *ptr) { + c10::BFloat16 __attribute__((__may_alias__)) *v_ptr = + reinterpret_cast(&v); + *ptr = *(v_ptr + 1); +} + +#ifdef __AVX512F__ +inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) + : reg(_mm256_cvtepi32_epi16( + _mm256_bsrli_epi128(_mm256_castps_si256(v.reg), 2))) {} + +inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) + : reg(_mm512_cvtepi32_epi16( + _mm512_bsrli_epi128(_mm512_castps_si512(v.reg), 2))) {} +#else +namespace{ +__m128i FP32Vec8_to_BF16Vec8_avx2(__m256 a) { + __m256i ai = _mm256_castps_si256(a); + ai = _mm256_srli_epi32(ai, 16); + ai = _mm256_packus_epi32(ai, ai); + ai = _mm256_permute4x64_epi64(ai, 0b00111001); + return _mm256_extracti128_si256(ai, 0); +} +} + +inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) + : reg(FP32Vec8_to_BF16Vec8_avx2(v.reg)) {} + +inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) { + BF16Vec8 low = BF16Vec8(FP32Vec8(v.reg_low)); + BF16Vec8 high = BF16Vec8(FP32Vec8(v.reg_high)); + reg = _mm256_insertf128_si256(_mm256_castsi128_si256(low.reg), high.reg, 1); +} +#endif // __AVX512F__ +#endif // __AVX512BF16__ + +inline void prefetch(const void *addr) { _mm_prefetch(addr, 
_MM_HINT_T1); } + +}; // namespace vec_op + +#endif diff --git a/csrc/cpu/torch_bindings.cpp b/csrc/cpu/torch_bindings.cpp index 39e8cf3ed3c10..5be0e9810b5b9 100644 --- a/csrc/cpu/torch_bindings.cpp +++ b/csrc/cpu/torch_bindings.cpp @@ -16,8 +16,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { " Tensor value_cache, int num_kv_heads, float scale," " Tensor block_tables, Tensor seq_lens, int block_size," " int max_seq_len, Tensor? alibi_slopes," - " str kv_cache_dtype, float kv_scale, int tp_rank," - " int blocksparse_local_blocks," + " str kv_cache_dtype, float k_scale, float v_scale," + " int tp_rank, int blocksparse_local_blocks," " int blocksparse_vert_stride, int blocksparse_block_size," " int blocksparse_head_sliding_step) -> ()"); ops.impl("paged_attention_v1", torch::kCPU, &paged_attention_v1); @@ -30,8 +30,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { " Tensor value_cache, int num_kv_heads, float scale," " Tensor block_tables, Tensor seq_lens, int block_size," " int max_seq_len, Tensor? alibi_slopes," - " str kv_cache_dtype, float kv_scale, int tp_rank," - " int blocksparse_local_blocks," + " str kv_cache_dtype, float k_scale, float v_scale," + " int tp_rank, int blocksparse_local_blocks," " int blocksparse_vert_stride, int blocksparse_block_size," " int blocksparse_head_sliding_step) -> ()"); ops.impl("paged_attention_v2", torch::kCPU, &paged_attention_v2); @@ -103,7 +103,7 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) { " Tensor! key_cache, Tensor! value_cache," " Tensor slot_mapping," " str kv_cache_dtype," - " float kv_scale) -> ()"); + " float k_scale, float v_scale) -> ()"); cache_ops.impl("reshape_and_cache", torch::kCPU, &reshape_and_cache); } diff --git a/csrc/ops.h b/csrc/ops.h index 6f0a7143c9169..9ef1fcb465bf3 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -1,5 +1,6 @@ #pragma once +#include #include void paged_attention_v1( @@ -7,8 +8,8 @@ void paged_attention_v1( torch::Tensor& value_cache, int64_t num_kv_heads, double scale, torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size, int64_t max_seq_len, const c10::optional& alibi_slopes, - const std::string& kv_cache_dtype, double kv_scale, const int64_t tp_rank, - const int64_t blocksparse_local_blocks, + const std::string& kv_cache_dtype, double k_scale, double v_scale, + const int64_t tp_rank, const int64_t blocksparse_local_blocks, const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size, const int64_t blocksparse_head_sliding_step); @@ -18,8 +19,8 @@ void paged_attention_v2( torch::Tensor& value_cache, int64_t num_kv_heads, double scale, torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size, int64_t max_seq_len, const c10::optional& alibi_slopes, - const std::string& kv_cache_dtype, double kv_scale, const int64_t tp_rank, - const int64_t blocksparse_local_blocks, + const std::string& kv_cache_dtype, double k_scale, double v_scale, + const int64_t tp_rank, const int64_t blocksparse_local_blocks, const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size, const int64_t blocksparse_head_sliding_step); @@ -51,6 +52,11 @@ void gelu_fast(torch::Tensor& out, torch::Tensor& input); void gelu_quick(torch::Tensor& out, torch::Tensor& input); +void advance_step(int64_t num_seqs, int64_t num_queries, int64_t block_size, + torch::Tensor& input_tokens, torch::Tensor& sampled_token_ids, + torch::Tensor& input_positions, torch::Tensor& seq_lens, + torch::Tensor& slot_mapping, torch::Tensor& block_tables); + #ifndef 
USE_ROCM torch::Tensor aqlm_gemm(const torch::Tensor& input, const torch::Tensor& codes, const torch::Tensor& codebooks, @@ -83,20 +89,30 @@ torch::Tensor gptq_marlin_24_gemm(torch::Tensor& a, torch::Tensor& b_q_weight, int64_t size_k); torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight, - torch::Tensor& b_scales, torch::Tensor& g_idx, - torch::Tensor& perm, torch::Tensor& workspace, - int64_t num_bits, int64_t size_m, int64_t size_n, - int64_t size_k, bool is_k_full); + torch::Tensor& b_scales, torch::Tensor& b_zeros, + torch::Tensor& g_idx, torch::Tensor& perm, + torch::Tensor& workspace, int64_t num_bits, + int64_t size_m, int64_t size_n, int64_t size_k, + bool is_k_full, bool has_zp); torch::Tensor gptq_marlin_repack(torch::Tensor& b_q_weight, torch::Tensor& perm, int64_t size_k, int64_t size_n, int64_t num_bits); +torch::Tensor awq_marlin_repack(torch::Tensor& b_q_weight, int64_t size_k, + int64_t size_n, int64_t num_bits); + +torch::Tensor fp8_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight, + torch::Tensor& b_scales, torch::Tensor& workspace, + int64_t num_bits, int64_t size_m, int64_t size_n, + int64_t size_k); + bool cutlass_scaled_mm_supports_fp8(int64_t cuda_device_capability); void cutlass_scaled_mm(torch::Tensor& out, torch::Tensor const& a, torch::Tensor const& b, torch::Tensor const& a_scales, - torch::Tensor const& b_scales); + torch::Tensor const& b_scales, + c10::optional const& bias); #endif @@ -116,12 +132,16 @@ torch::Tensor gptq_gemm(torch::Tensor a, torch::Tensor b_q_weight, void gptq_shuffle(torch::Tensor q_weight, torch::Tensor q_perm, int64_t bit); -void static_scaled_fp8_quant(torch::Tensor& out, torch::Tensor& input, - torch::Tensor& scale); +void static_scaled_fp8_quant(torch::Tensor& out, torch::Tensor const& input, + torch::Tensor const& scale); -void dynamic_scaled_fp8_quant(torch::Tensor& out, torch::Tensor& input, +void dynamic_scaled_fp8_quant(torch::Tensor& out, torch::Tensor const& input, torch::Tensor& scale); +void dynamic_per_token_scaled_fp8_quant( + torch::Tensor& out, torch::Tensor const& input, torch::Tensor& scale, + c10::optional const& scale_ub); + void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts, int64_t block_size, torch::Tensor sorted_token_ids, torch::Tensor experts_ids, diff --git a/csrc/prepare_inputs/advance_step.cu b/csrc/prepare_inputs/advance_step.cu new file mode 100644 index 0000000000000..0e537ddd6c4cd --- /dev/null +++ b/csrc/prepare_inputs/advance_step.cu @@ -0,0 +1,131 @@ +/* + * The goal of this GPU kernel is to advance input tensors on the GPU directly + * PR: https://github.com/vllm-project/vllm/pull/6338 + * Current restrictions: + * 1. Specialized for DraftModelRunner + * 2. 
Supports flash_attn only + */ + +#include "advance_step.cuh" + +namespace prepare_inputs { + +// +template +__global__ void advance_step_kernel(int num_seqs, int num_queries, + int block_size, long* input_tokens_ptr, + long const* sampled_token_ids_ptr, + long* input_positions_ptr, + int* seq_lens_ptr, long* slot_mapping_ptr, + int const* block_tables_ptr, + int64_t const block_tables_stride) { + int num_query_blocks = div_ceil(num_queries, num_threads); + + if (blockIdx.x >= num_query_blocks) { + return; + } + + int cur_query_id = blockIdx.x * num_threads + threadIdx.x; + + if (cur_query_id >= num_queries) { + return; + } + + // Update input_tokens + input_tokens_ptr[cur_query_id] = sampled_token_ids_ptr[cur_query_id]; + + int seq_len = seq_lens_ptr[cur_query_id]; + int next_seq_len = seq_len + 1; + int next_input_pos = next_seq_len - 1; + + // Update seq_lens + seq_lens_ptr[cur_query_id] = next_seq_len; + // Update input_positions + input_positions_ptr[cur_query_id] = next_input_pos; + + int const* seq_block_tables_ptr = + block_tables_ptr + block_tables_stride * cur_query_id; + + int block_index = next_input_pos / block_size; + int block_offset = next_input_pos % block_size; + + int slot_num = seq_block_tables_ptr[block_index] * block_size + block_offset; + // Update slot_mapping + slot_mapping_ptr[cur_query_id] = slot_num; +} + +inline void verify_tensor(std::string const& name, torch::Tensor& t, + int64_t const size_0, int64_t const size_1, + c10::ScalarType const type) { + bool size_0_cond = true; + if (size_0 != -1) { + size_0_cond = t.size(0) == size_0; + } + + bool size_1_cond = true; + if (size_1 != -1) { + size_1_cond = t.size(1) == size_1; + } + + bool is_contiguous = t.is_contiguous(); + bool same_type = t.dtype() == type; + + bool pass = size_0_cond && size_1_cond && is_contiguous && same_type; + if (!pass) { + TORCH_CHECK(false, "tensor: name = ", name, ", shape = ", t.sizes(), + " is_cont = ", t.is_contiguous(), ", type = ", t.dtype(), + " is not as expected: shape = [", size_0, ", ", size_1, + "], type = ", type); + } +} + +void advance_step(int num_seqs, int num_queries, int block_size, + torch::Tensor& input_tokens, // type: long + torch::Tensor& sampled_token_ids, // type: long + torch::Tensor& input_positions, // type: long + torch::Tensor& seq_lens, // type: int + torch::Tensor& slot_mapping, // type: long + torch::Tensor& block_tables) { // type: int + + if (logging) { + printf("advance_step:\n"); + printf(" num_seqs = %d\n", num_seqs); + printf(" num_queries = %d\n", num_queries); + printf(" block_size = %d\n", block_size); + } + // Verify all tensors + verify_tensor("input_tokens", input_tokens, num_seqs, -1, at::kLong); + verify_tensor("sampled_token_ids", sampled_token_ids, num_queries, 1, + at::kLong); + verify_tensor("input_positions", input_positions, num_seqs, -1, at::kLong); + verify_tensor("seq_lens", seq_lens, num_seqs, -1, at::kInt); + verify_tensor("slot_mapping", slot_mapping, num_seqs, -1, at::kLong); + verify_tensor("block_tables", block_tables, num_seqs, -1, at::kInt); + + int dev = sampled_token_ids.get_device(); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(dev); + + int blocks; + cudaDeviceGetAttribute(&blocks, cudaDevAttrMultiProcessorCount, dev); + + advance_step_kernel<<>>( + num_seqs, num_queries, block_size, + reinterpret_cast(input_tokens.data_ptr()), + reinterpret_cast(sampled_token_ids.data_ptr()), + reinterpret_cast(input_positions.data_ptr()), + reinterpret_cast(seq_lens.data_ptr()), + reinterpret_cast(slot_mapping.data_ptr()), 
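// Aside: what advance_step does for one query, written as plain scalar code.
// The struct and helper below are hypothetical and only illustrate the index
// arithmetic of the kernel above: feed the sampled token back as the next
// input, bump the sequence length, and translate the new token position into
// a physical KV-cache slot through the block table.
struct StepState {
  long input_token;
  long input_position;
  int seq_len;
  long slot;
};

inline void advance_one_query(StepState &s, long sampled_token,
                              const int *block_table, int block_size) {
  s.input_token = sampled_token;  // next input token = token just sampled
  int next_pos = s.seq_len;       // position of the new token (0-based)
  s.seq_len += 1;
  s.input_position = next_pos;
  int block_index = next_pos / block_size;
  int block_offset = next_pos % block_size;
  // Physical slot = start of the logical block in the KV cache + offset.
  s.slot = static_cast<long>(block_table[block_index]) * block_size + block_offset;
}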
+ reinterpret_cast(block_tables.data_ptr()), + block_tables.stride(0)); +} + +} // namespace prepare_inputs + +void advance_step(int64_t num_seqs, int64_t num_queries, int64_t block_size, + torch::Tensor& input_tokens, torch::Tensor& sampled_token_ids, + torch::Tensor& input_positions, torch::Tensor& seq_lens, + torch::Tensor& slot_mapping, torch::Tensor& block_tables) { + prepare_inputs::advance_step(num_seqs, num_queries, block_size, input_tokens, + sampled_token_ids, input_positions, seq_lens, + slot_mapping, block_tables); +} \ No newline at end of file diff --git a/csrc/prepare_inputs/advance_step.cuh b/csrc/prepare_inputs/advance_step.cuh new file mode 100644 index 0000000000000..f21574681b1ab --- /dev/null +++ b/csrc/prepare_inputs/advance_step.cuh @@ -0,0 +1,19 @@ +#pragma once + +#include + +#include +#include +#include +#include +#include +#include + +namespace prepare_inputs { + +static constexpr int max_threads = 256; +static constexpr bool logging = false; + +constexpr int div_ceil(int a, int b) { return (a + b - 1) / b; } + +} // namespace prepare_inputs diff --git a/csrc/punica/bgmv/bgmv_config.h b/csrc/punica/bgmv/bgmv_config.h old mode 100755 new mode 100644 index cb6694b3036e9..2c8d007d8719f --- a/csrc/punica/bgmv/bgmv_config.h +++ b/csrc/punica/bgmv/bgmv_config.h @@ -86,6 +86,7 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, f(in_T, out_T, W_T, narrow, 36864) \ f(in_T, out_T, W_T, narrow, 43264) \ f(in_T, out_T, W_T, narrow, 49152) \ + f(in_T, out_T, W_T, narrow, 49408) \ f(in_T, out_T, W_T, narrow, 60544) \ f(in_T, out_T, W_T, narrow, 60672) \ f(in_T, out_T, W_T, narrow, 64000) \ @@ -182,6 +183,7 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, f(in_T, out_T, W_T, 36864, narrow) \ f(in_T, out_T, W_T, 43264, narrow) \ f(in_T, out_T, W_T, 49152, narrow) \ + f(in_T, out_T, W_T, 49408, narrow) \ f(in_T, out_T, W_T, 60544, narrow) \ f(in_T, out_T, W_T, 60672, narrow) \ f(in_T, out_T, W_T, 64000, narrow) \ diff --git a/csrc/quantization/cutlass_w8a8/common.hpp b/csrc/quantization/cutlass_w8a8/common.hpp index 23d0587bbdc5d..bf04bb400790f 100644 --- a/csrc/quantization/cutlass_w8a8/common.hpp +++ b/csrc/quantization/cutlass_w8a8/common.hpp @@ -17,3 +17,11 @@ inline uint32_t next_pow_2(uint32_t const num) { return 1 << (CHAR_BIT * sizeof(num) - __builtin_clz(num - 1)); } +inline int get_cuda_max_shared_memory_per_block_opt_in(int const device) { + int max_shared_mem_per_block_opt_in = 0; + cudaDeviceGetAttribute(&max_shared_mem_per_block_opt_in, + cudaDevAttrMaxSharedMemoryPerBlockOptin, + device); + return max_shared_mem_per_block_opt_in; +} + diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu index 740b9fb64a754..6ce25c5ac897b 100644 --- a/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu +++ b/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu @@ -77,24 +77,12 @@ struct enable_sm89_to_sm90 : Kernel { }; /* - This epilogue function defines a quantized GEMM operation similar to - torch._scaled_mm. - - A and B may be both either int8 or fp8_e4m3. A can be quantized per-tensor or - per-row. B can be quantized per-tensor or per-column. - Any combination of per-tensor and per-row or column is supported. - A and B must have symmetric quantization (zero point == 0). - - So the GEMM operation is D = (a_scales * A) (b_scales * B), where the - scales are applied elementwise with numpy-style broadcasting. 
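// Aside: a scalar reference for the epilogue semantics described above,
// D = (a_scales * A) (b_scales * B) with numpy-style broadcasting. Because the
// scales broadcast per row of A and per column of B, they can be applied to
// the integer accumulator in the epilogue:
//   sum_k (sa * A[m,k]) * (sb * B[k,n]) == sa * sb * sum_k A[m,k] * B[k,n].
// Illustrative sketch only; names and the int8 element type are assumptions.
#include <cstddef>
#include <cstdint>
#include <vector>

std::vector<float> scaled_mm_ref(const std::vector<int8_t> &A,        // M x K, row-major
                                 const std::vector<int8_t> &B,        // K x N, row-major
                                 const std::vector<float> &a_scales,  // size 1 or M
                                 const std::vector<float> &b_scales,  // size 1 or N
                                 size_t M, size_t N, size_t K) {
  std::vector<float> D(M * N);
  for (size_t m = 0; m < M; ++m) {
    float sa = a_scales.size() == 1 ? a_scales[0] : a_scales[m];
    for (size_t n = 0; n < N; ++n) {
      float sb = b_scales.size() == 1 ? b_scales[0] : b_scales[n];
      int32_t acc = 0;  // integer accumulation, as in the quantized GEMM
      for (size_t k = 0; k < K; ++k)
        acc += int32_t(A[m * K + k]) * int32_t(B[k * N + n]);
      D[m * N + n] = sa * sb * static_cast<float>(acc);  // scales applied in the epilogue
    }
  }
  return D;
}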
- - ScaleA and ScaleB define the epilogue functions that apply the scales for - the A and B operands respectively. These scales may be either per-tensor or - per row or column. -*/ + * This class provides the common ScaleA and ScaleB descriptors for the + * ScaledEpilogue and ScaledEpilogueBias classes. + */ template -struct ScaledEpilogue { - private: +struct ScaledEpilogueBase { + protected: using Accum = cutlass::epilogue::threadblock::VisitorAccFetch; using ScaleA = cutlass::epilogue::threadblock::VisitorColOrScalarBroadcast< @@ -102,6 +90,32 @@ struct ScaledEpilogue { using ScaleB = cutlass::epilogue::threadblock::VisitorRowOrScalarBroadcast< OutputTileThreadMap, float, Stride, Int<1>, Int<0>>>; +}; + +/* + This epilogue function defines a quantized GEMM operation similar to + torch._scaled_mm. + + A and B may be both either int8 or fp8_e4m3. A can be quantized per-tensor or + per-row. B can be quantized per-tensor or per-column. + Any combination of per-tensor and per-row or column is supported. + A and B must have symmetric quantization (zero point == 0). + + So the GEMM operation is D = (a_scales * A) (b_scales * B), where the + scales are applied elementwise with numpy-style broadcasting. + + ScaleA and ScaleB define the epilogue functions that apply the scales for + the A and B operands respectively. These scales may be either per-tensor or + per row or column. +*/ +template +struct ScaledEpilogue + : private ScaledEpilogueBase { + private: + using SUPER = ScaledEpilogueBase; + using Accum = typename SUPER::Accum; + using ScaleA = typename SUPER::ScaleA; + using ScaleB = typename SUPER::ScaleB; using Compute0 = cutlass::epilogue::threadblock::VisitorCompute< cutlass::multiplies, float, float, @@ -134,6 +148,53 @@ struct ScaledEpilogue { } }; +template +struct ScaledEpilogueBias + : private ScaledEpilogueBase { + private: + using SUPER = ScaledEpilogueBase; + using Accum = typename SUPER::Accum; + using ScaleA = typename SUPER::ScaleA; + using ScaleB = typename SUPER::ScaleB; + + using Compute0 = cutlass::epilogue::threadblock::VisitorCompute< + cutlass::multiplies, float, float, + cutlass::FloatRoundStyle::round_to_nearest>; + + using EVTCompute0 = + cutlass::epilogue::threadblock::Sm80EVT; + + using Compute1 = cutlass::epilogue::threadblock::VisitorCompute< + cutlass::multiply_add, ElementD, float, + cutlass::FloatRoundStyle::round_to_nearest>; + + using Bias = cutlass::epilogue::threadblock::VisitorRowBroadcast< + OutputTileThreadMap, ElementD, Stride, Int<1>, Int<0>>>; + + public: + using EVTCompute = cutlass::epilogue::threadblock::Sm80EVT; + using ArgumentType = typename EVTCompute::Arguments; + + static ArgumentType prepare_args(torch::Tensor const& a_scales, + torch::Tensor const& b_scales, + torch::Tensor const& bias) { + using ScaleAArgs = typename ScaleA::Arguments; + using ScaleBArgs = typename ScaleB::Arguments; + using BiasArgs = typename Bias::Arguments; + + ScaleBArgs b_args{b_scales.data_ptr(), b_scales.numel() != 1, {}}; + ScaleAArgs a_args{a_scales.data_ptr(), a_scales.numel() != 1, {}}; + BiasArgs bias_args{static_cast(bias.data_ptr()), {}}; + + typename EVTCompute0::Arguments evt0_compute_args{b_args}; + + typename EVTCompute::Arguments evt_compute_args{a_args, evt0_compute_args, + bias_args}; + return evt_compute_args; + } +}; + template typename ArchGuard, typename ElementAB_, typename ElementD_, template typename Epilogue_, typename TileShape, @@ -168,13 +229,13 @@ struct cutlass_2x_gemm { // clang-format off using RowMajor = typename 
cutlass::layout::RowMajor; using ColumnMajor = typename cutlass::layout::ColumnMajor; - using KernelType = + using KernelType = ArchGuard +void fallback_cutlass_gemm_caller(torch::Tensor& out, torch::Tensor const& a, + torch::Tensor const& b, + EpilogueArgs&&... args) { + // In some cases, the GPU isn't able to accommodate the + // shared memory requirements of the Gemm. In such cases, use + // the FallbackGemm instead. + static const int max_shared_mem_per_block_opt_in = + get_cuda_max_shared_memory_per_block_opt_in(0); + + size_t const gemm_shared_mem_size = + sizeof(typename Gemm::KernelType::SharedStorage); + size_t const fallback_gemm_shared_mem_size = + sizeof(typename FallbackGemm::KernelType::SharedStorage); + + if (gemm_shared_mem_size <= max_shared_mem_per_block_opt_in) { + return cutlass_gemm_caller(out, a, b, + std::forward(args)...); + } else { + TORCH_CHECK(fallback_gemm_shared_mem_size <= + max_shared_mem_per_block_opt_in); + return cutlass_gemm_caller( + out, a, b, std::forward(args)...); + } +} + template typename Epilogue> struct sm80_config_default { // This config is used in 2 cases, // - M in (128, inf) // - M in (64, 128] and N >= 8192 + // Shared Memory required by this Gemm - 81920 bytes static_assert(std::is_same()); using TileShape = typename cutlass::gemm::GemmShape<128, 128, 64>; using WarpShape = typename cutlass::gemm::GemmShape<64, 64, 64>; @@ -271,6 +359,7 @@ struct sm80_config_M64 { // This config is used in 2 cases, // - M in (32, 64] // - M in (64, 128] and N < 8192 + // Shared Memory required by this Gemm - 122880 bytes static_assert(std::is_same()); using TileShape = typename cutlass::gemm::GemmShape<64, 128, 128>; using WarpShape = typename cutlass::gemm::GemmShape<64, 64, 64>; @@ -284,6 +373,7 @@ template typename Epilogue> struct sm80_config_M32 { // M in (16, 32] + // Shared Memory required by this Gemm - 61440 bytes static_assert(std::is_same()); using TileShape = typename cutlass::gemm::GemmShape<32, 64, 128>; using WarpShape = typename cutlass::gemm::GemmShape<32, 64, 64>; @@ -297,6 +387,7 @@ template typename Epilogue> struct sm80_config_M16 { // M in [1, 16] + // Shared Memory required by this Gemm - 51200 bytes static_assert(std::is_same()); using TileShape = typename cutlass::gemm::GemmShape<16, 64, 128>; using WarpShape = typename cutlass::gemm::GemmShape<16, 64, 64>; @@ -331,47 +422,56 @@ void cutlass_gemm_sm80_dispatch(torch::Tensor& out, torch::Tensor const& a, using Cutlass2xGemmM16 = typename sm80_config_M16::Cutlass2xGemm; + // Due to shared memory requirements, some Gemms may fail to run on some + // GPUs. As the name indicates, the Fallback Gemm is used as an alternative + // in such cases. + // sm80_config_M16 has the least shared-memory requirement. However, + // based on some profiling, we select sm80_config_M32 as a better alternative + // performance wise. 
+ using FallbackGemm = + typename sm80_config_M32::Cutlass2xGemm; + uint32_t const m = a.size(0); uint32_t const mp2 = std::max(static_cast(16), next_pow_2(m)); // next power of 2 if (mp2 <= 16) { // M in [1, 16] - return cutlass_gemm_caller( + return fallback_cutlass_gemm_caller( out, a, b, std::forward(args)...); } else if (mp2 <= 32) { // M in (16, 32] - return cutlass_gemm_caller( + return fallback_cutlass_gemm_caller( out, a, b, std::forward(args)...); } else if (mp2 <= 64) { // M in (32, 64] - return cutlass_gemm_caller( + return fallback_cutlass_gemm_caller( out, a, b, std::forward(args)...); } else if (mp2 <= 128) { // M in (64, 128] uint32_t const n = out.size(1); bool const small_n = n < 8192; if (small_n) { - return cutlass_gemm_caller( + return fallback_cutlass_gemm_caller( out, a, b, std::forward(args)...); } else { - return cutlass_gemm_caller( + return fallback_cutlass_gemm_caller( out, a, b, std::forward(args)...); } } else { // M in (128, inf) - return cutlass_gemm_caller( + return fallback_cutlass_gemm_caller( out, a, b, std::forward(args)...); } } -void cutlass_scaled_mm_sm75(torch::Tensor& out, torch::Tensor const& a, - torch::Tensor const& b, - torch::Tensor const& a_scales, - torch::Tensor const& b_scales) { +template
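// Aside: the shared-memory check behind fallback_cutlass_gemm_caller above,
// reduced to its essentials. A tile configuration is only usable when
// sizeof(Gemm::KernelType::SharedStorage) fits within the device's opt-in
// shared-memory-per-block limit; otherwise the smaller FallbackGemm
// (sm80_config_M32) is launched instead. Illustrative sketch; the helper name
// fits_in_smem is hypothetical.
#include <cuda_runtime.h>
#include <cstddef>

inline bool fits_in_smem(size_t required_bytes, int device = 0) {
  int opt_in_limit = 0;  // maximum opt-in shared memory per block, in bytes
  cudaDeviceGetAttribute(&opt_in_limit, cudaDevAttrMaxSharedMemoryPerBlockOptin,
                         device);
  return required_bytes <= static_cast<size_t>(opt_in_limit);
}

// Usage sketch:
//   if (fits_in_smem(sizeof(typename Gemm::KernelType::SharedStorage)))
//     launch the primary Gemm;
//   else
//     launch the FallbackGemm (its SharedStorage is checked the same way).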