From ce8a19bbf7635d56ac03d969f9e6fc6cbc809cad Mon Sep 17 00:00:00 2001
From: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com>
Date: Fri, 14 Jun 2024 10:18:19 -0400
Subject: [PATCH] Remote push refactor (#297)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

SUMMARY:
* updated the model test structure to focus on core models
* refactored tests to use environment variables (currently at the "test group" level, so each folder has an env variable); all tests are off by default and must be explicitly enabled
* refactored the build-test workflow to use a list of env variables rather than a skip-test list

WHY:
* this enables us to be more sane about which tests are and are not turned on, as opposed to maintaining a long list of skipped files
* this enables us to actually track what is run and what is not run (via testmo, which tracks skipped tests)
* this enables us to have more fine-grained control over what is run vs. not run (we can add more env vars at the sub-group level to turn off more tests)

---------

Signed-off-by: kerthcet Signed-off-by: Muralidhar Andoorveedu Signed-off-by: pandyamarut Co-authored-by: Alexander Matveev <59768536+alexm-neuralmagic@users.noreply.github.com> Co-authored-by: Woosuk Kwon Co-authored-by: Cyrus Leung Co-authored-by: Wenwei Zhang <40779233+ZwwWayne@users.noreply.github.com> Co-authored-by: Alexei-V-Ivanov-AMD <156011006+Alexei-V-Ivanov-AMD@users.noreply.github.com> Co-authored-by: Alexey Kondratiev Co-authored-by: Mor Zusman Co-authored-by: Mor Zusman Co-authored-by: Aurick Qiao Co-authored-by: Kuntai Du Co-authored-by: Antoni Baum Co-authored-by: HUANG Fei Co-authored-by: Isotr0py <41363108+Isotr0py@users.noreply.github.com> Co-authored-by: Simon Mo Co-authored-by: Michael Goin Co-authored-by: Kante Yin Co-authored-by: sasha0552 Co-authored-by: SangBin Cho Co-authored-by: Tyler Michael Smith Co-authored-by: Cody Yu Co-authored-by: raywanb <112235519+raywanb@users.noreply.github.com> Co-authored-by: Nick Hill Co-authored-by: Philipp Moritz Co-authored-by: Letian Li Co-authored-by: Murali Andoorveedu <37849411+andoorve@users.noreply.github.com> Co-authored-by: Dipika Sikka Co-authored-by: Varun Sundar Rabindranath Co-authored-by: Varun Sundar Rabindranath Co-authored-by: Elisei Smirnov <61423871+kezouke@users.noreply.github.com> Co-authored-by: Elisei Smirnov Co-authored-by: youkaichao Co-authored-by: leiwen83 Co-authored-by: Lei Wen Co-authored-by: Eric Xihui Lin Co-authored-by: beagleski Co-authored-by: bapatra Co-authored-by: Barun Patra Co-authored-by: Lily Liu Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com> Co-authored-by: Zhuohan Li Co-authored-by: Isotr0py <2037008807@qq.com> Co-authored-by: Michał Moskal Co-authored-by: Ruth Evans Co-authored-by: Divakar Verma <137818590+divakar-amd@users.noreply.github.com> Co-authored-by: Roger Wang Co-authored-by: Junichi Sato Co-authored-by: Marut Pandya Co-authored-by: afeldman-nm <156691304+afeldman-nm@users.noreply.github.com> Co-authored-by: Ronen Schaffer Co-authored-by: Itay Etelis <92247226+Etelis@users.noreply.github.com> Co-authored-by: omkar kakarparthi <75638701+okakarpa@users.noreply.github.com> Co-authored-by: Alexei V.
Ivanov Co-authored-by: Breno Faria Co-authored-by: Breno Faria Co-authored-by: Hyunsung Lee Co-authored-by: Chansung Park Co-authored-by: SnowDist Co-authored-by: functionxu123 <1229853312@qq.com> Co-authored-by: xuhao Co-authored-by: Domenic Barbuzzi --- .../actions/nm-set-env-test-skip/action.yml | 15 ++ .github/workflows/nm-build-test.yml | 30 +-- .github/workflows/nm-nightly.yml | 9 +- .github/workflows/nm-release.yml | 8 +- .github/workflows/nm-remote-push.yml | 98 ++++----- .github/workflows/nm-test.yml | 15 +- .github/workflows/nm-weekly.yml | 2 +- neuralmagic/tests/test_skip_env_vars/full.txt | 19 ++ .../tests/test_skip_env_vars/smoke.txt | 19 ++ requirements-dev.txt | 2 + tests/accuracy/test_lm_eval_correctness.py | 5 + tests/async_engine/test_api_server.py | 6 + tests/async_engine/test_async_llm_engine.py | 5 + tests/async_engine/test_chat_template.py | 5 + tests/async_engine/test_openapi_server_ray.py | 5 + tests/async_engine/test_request_tracker.py | 5 + .../test_basic_correctness.py | 6 + .../test_basic_server_correctness.py | 188 ----------------- .../basic_correctness/test_chunked_prefill.py | 7 + tests/basic_correctness/test_preemption.py | 6 + tests/conftest.py | 14 +- tests/core/block/e2e/test_correctness.py | 5 + .../e2e/test_correctness_sliding_window.py | 5 + tests/core/block/test_block_manager_v2.py | 5 + tests/core/block/test_block_table.py | 5 + tests/core/block/test_common.py | 5 + .../block/test_cpu_gpu_block_allocator.py | 5 + tests/core/block/test_naive_block.py | 5 + tests/core/block/test_prefix_caching_block.py | 5 + tests/core/test_block_manager.py | 5 + tests/core/test_chunked_prefill_scheduler.py | 5 + tests/core/test_scheduler.py | 5 + .../test_basic_distributed_correctness.py | 6 + .../test_chunked_prefill_distributed.py | 6 + tests/distributed/test_comm_ops.py | 5 + tests/distributed/test_custom_all_reduce.py | 5 + tests/distributed/test_pynccl.py | 5 + .../output_processor/test_multi_step.py | 5 + .../output_processor/test_stop_checker.py | 5 + tests/engine/test_computed_prefix_blocks.py | 5 + tests/engine/test_detokenization.py | 5 + tests/engine/test_multiproc_workers.py | 5 + tests/engine/test_skip_tokenizer_init.py | 5 + tests/engine/test_stop_reason.py | 5 + tests/engine/test_stop_strings.py | 5 + tests/entrypoints/openai/test_serving_chat.py | 5 + tests/entrypoints/test_guided_processors.py | 5 + tests/entrypoints/test_llm_encode.py | 5 + tests/entrypoints/test_llm_generate.py | 5 + tests/entrypoints/test_openai_run_batch.py | 7 + tests/entrypoints/test_openai_server.py | 5 + .../test_server_oot_registration.py | 5 + tests/kernels/test_activation.py | 5 + tests/kernels/test_attention.py | 5 + tests/kernels/test_attention_selector.py | 5 + tests/kernels/test_blocksparse_attention.py | 5 + tests/kernels/test_cache.py | 5 + tests/kernels/test_cutlass.py | 5 + tests/kernels/test_flash_attn.py | 6 + tests/kernels/test_int8_quant.py | 5 + tests/kernels/test_layernorm.py | 5 + tests/kernels/test_marlin_gemm.py | 5 + tests/kernels/test_moe.py | 5 + tests/kernels/test_pos_encoding.py | 5 + tests/kernels/test_prefix_prefill.py | 5 + tests/kernels/test_rand.py | 5 + tests/kernels/test_sampler.py | 5 + tests/lora/test_baichuan.py | 7 +- tests/lora/test_chatglm3.py | 7 + tests/lora/test_gemma.py | 5 + tests/lora/test_layer_variation.py | 5 + tests/lora/test_layers.py | 5 + tests/lora/test_llama.py | 5 + tests/lora/test_long_context.py | 5 + tests/lora/test_lora.py | 5 + tests/lora/test_lora_checkpoints.py | 5 + tests/lora/test_lora_manager.py | 5 + 
tests/lora/test_mixtral.py | 5 + tests/lora/test_phi.py | 7 + tests/lora/test_punica.py | 5 + tests/lora/test_quant_model.py | 5 + tests/lora/test_tokenizer_group.py | 5 + tests/lora/test_utils.py | 5 + tests/lora/test_worker.py | 7 + tests/metrics/test_metrics.py | 5 + tests/model_executor/weight_utils.py | 6 + tests/models/compare_utils.py | 1 + tests/models/test_aqlm.py | 5 + tests/models/test_big_models.py | 6 + tests/models/test_compressed.py | 61 ------ tests/models/test_embedding.py | 6 + tests/models/test_fp8.py | 5 + tests/models/test_gptq_marlin.py | 5 + tests/models/test_gptq_marlin_24.py | 5 + tests/models/test_llava.py | 5 + tests/models/test_marlin.py | 5 + tests/models/test_mistral.py | 6 + tests/models/test_models.py | 6 + tests/models/test_models_logprobs.py | 5 + tests/models/test_oot_registration.py | 6 + tests/models/test_registry.py | 5 + tests/models_core/__init__.py | 0 tests/models_core/test_llm_logprobs.py | 57 +++++ .../test_magic_wand.py} | 51 +++-- tests/models_core/test_server_logprobs.py | 194 ++++++++++++++++++ tests/nm_utils/server.py | 13 -- tests/nm_utils/utils_skip.py | 134 ++++++++++++ .../test_disable_sliding_window.py | 6 + tests/prefix_caching/test_prefix_caching.py | 6 + tests/quantization/test_compressed_tensors.py | 6 + tests/quantization/test_configs.py | 5 + tests/quantization/test_fp8.py | 5 + tests/samplers/test_beam_search.py | 6 + tests/samplers/test_ignore_eos.py | 5 + tests/samplers/test_logits_processor.py | 5 + tests/samplers/test_logprobs.py | 5 + tests/samplers/test_ranks.py | 5 + tests/samplers/test_rejection_sampler.py | 5 + tests/samplers/test_sampler.py | 5 + tests/samplers/test_seeded_generate.py | 5 + tests/spec_decode/e2e/test_compatibility.py | 5 + tests/spec_decode/e2e/test_integration.py | 6 + .../spec_decode/e2e/test_integration_dist.py | 5 + tests/spec_decode/e2e/test_logprobs.py | 5 + .../e2e/test_multistep_correctness.py | 5 + .../spec_decode/e2e/test_ngram_correctness.py | 6 + tests/spec_decode/test_batch_expansion.py | 5 + tests/spec_decode/test_dynamic_spec_decode.py | 14 ++ tests/spec_decode/test_metrics.py | 5 + tests/spec_decode/test_multi_step_worker.py | 5 + tests/spec_decode/test_ngram_worker.py | 6 + tests/spec_decode/test_spec_decode_worker.py | 5 + tests/spec_decode/test_utils.py | 5 + tests/tensorizer_loader/test_tensorizer.py | 5 + tests/test_sharded_state_loader.py | 1 + tests/tokenization/test_cached_tokenizer.py | 6 + tests/tokenization/test_detokenize.py | 5 + tests/tokenization/test_tokenizer.py | 5 + tests/tokenization/test_tokenizer_group.py | 5 + tests/worker/test_model_runner.py | 5 + tests/worker/test_swap.py | 6 + 141 files changed, 1208 insertions(+), 365 deletions(-) create mode 100644 .github/actions/nm-set-env-test-skip/action.yml create mode 100644 neuralmagic/tests/test_skip_env_vars/full.txt create mode 100644 neuralmagic/tests/test_skip_env_vars/smoke.txt delete mode 100644 tests/basic_correctness/test_basic_server_correctness.py delete mode 100644 tests/models/test_compressed.py create mode 100644 tests/models_core/__init__.py create mode 100644 tests/models_core/test_llm_logprobs.py rename tests/{models/test_compressed_memory.py => models_core/test_magic_wand.py} (54%) create mode 100644 tests/models_core/test_server_logprobs.py create mode 100644 tests/nm_utils/utils_skip.py diff --git a/.github/actions/nm-set-env-test-skip/action.yml b/.github/actions/nm-set-env-test-skip/action.yml new file mode 100644 index 0000000000000..fb84bc3a6ef9e --- /dev/null +++ 
b/.github/actions/nm-set-env-test-skip/action.yml @@ -0,0 +1,15 @@ +name: set test skip env vars +description: 'sets env variables for test skipping. See tests/utils_skip.py' +inputs: + test_skip_env_vars: + description: 'file with list of env vars controlling which tests to run.' + required: true + +runs: + using: composite + steps: + - run: | + cat "${ENV_VAR_FILE}" >> $GITHUB_ENV + env: + ENV_VAR_FILE: ${{ inputs.test_skip_env_vars }} + shell: bash diff --git a/.github/workflows/nm-build-test.yml b/.github/workflows/nm-build-test.yml index aa58273d33648..fd677c9651a60 100644 --- a/.github/workflows/nm-build-test.yml +++ b/.github/workflows/nm-build-test.yml @@ -45,8 +45,8 @@ on: description: "git commit hash or branch name" type: string required: true - test_skip_list: - description: 'file containing tests to skip' + test_skip_env_vars: + description: 'file with list of env vars controlling which tests to run' type: string required: true # benchmark related parameters @@ -91,22 +91,22 @@ jobs: gitref: ${{ github.ref }} python: ${{ inputs.python }} whl: ${{ needs.BUILD.outputs.whl }} - test_skip_list: ${{ inputs.test_skip_list }} + test_skip_env_vars: ${{ inputs.test_skip_env_vars }} secrets: inherit # TODO: re-enable - TEST-MULTI: - needs: [BUILD] - if: success() # && contains(fromJSON('["NIGHTLY", "WEEKLY", "RELEASE"]'), inputs.wf_category) - uses: ./.github/workflows/nm-test.yml - with: - test_label: ${{ inputs.test_label_multi }} - timeout: ${{ inputs.test_timeout }} - gitref: ${{ github.ref }} - python: ${{ inputs.python }} - whl: ${{ needs.BUILD.outputs.whl }} - test_skip_list: ${{ inputs.test_skip_list }} - secrets: inherit + # TEST-MULTI: + # needs: [BUILD] + # if: success() && contains(fromJSON('["NIGHTLY", "WEEKLY", "RELEASE"]'), inputs.wf_category) + # uses: ./.github/workflows/nm-test.yml + # with: + # test_label: ${{ inputs.test_label_multi }} + # timeout: ${{ inputs.test_timeout }} + # gitref: ${{ github.ref }} + # python: ${{ inputs.python }} + # whl: ${{ needs.BUILD.outputs.whl }} + # test_skip_env_vars: ${{ inputs.test_skip_env_vars }} + # secrets: inherit UPLOAD: needs: [TEST-SOLO] diff --git a/.github/workflows/nm-nightly.yml b/.github/workflows/nm-nightly.yml index 2c5dc38dd6322..fecd49c190c46 100644 --- a/.github/workflows/nm-nightly.yml +++ b/.github/workflows/nm-nightly.yml @@ -1,4 +1,4 @@ -name: nm Nightly +name: nm nightly run-name: ${{ github.actor }} triggered nightly on ${{ github.ref }} on: schedule: @@ -45,7 +45,7 @@ jobs: test_label_solo: gcp-k8s-l4-solo test_label_multi: ignore test_timeout: 480 - test_skip_list: neuralmagic/tests/skip-for-nightly.txt + test_skip_env_vars: neuralmagic/tests/test_skip_env_vars/full.txt benchmark_label: gcp-k8s-l4-solo benchmark_config_list_file: ./.github/data/nm_benchmark_remote_push_configs_list.txt @@ -63,7 +63,7 @@ jobs: test_label_solo: aws-avx2-32G-a10g-24G test_label_multi: ignore test_timeout: 480 - test_skip_list: neuralmagic/tests/skip-for-nightly.txt + test_skip_env_vars: neuralmagic/tests/test_skip_env_vars/full.txt benchmark_label: gcp-k8s-l4-solo benchmark_config_list_file: ./.github/data/nm_benchmark_remote_push_configs_list.txt @@ -81,7 +81,8 @@ jobs: test_label_solo: gcp-k8s-l4-solo test_label_multi: ignore test_timeout: 480 - test_skip_list: neuralmagic/tests/skip-for-nightly.txt + test_skip_env_vars: neuralmagic/tests/test_skip_env_vars/full.txt + benchmark_label: gcp-k8s-l4-solo benchmark_config_list_file: ./.github/data/nm_benchmark_remote_push_configs_list.txt diff --git 
a/.github/workflows/nm-release.yml b/.github/workflows/nm-release.yml index b52b9046d0daf..f5c9056cbc5d7 100644 --- a/.github/workflows/nm-release.yml +++ b/.github/workflows/nm-release.yml @@ -23,7 +23,7 @@ jobs: test_label_solo: gcp-k8s-l4-solo test_label_multi: ignore test_timeout: 720 - test_skip_list: neuralmagic/tests/skip-for-release.txt + test_skip_env_vars: neuralmagic/tests/test_skip_env_vars/full.txt benchmark_label: gcp-k8s-l4-solo benchmark_config_list_file: ./.github/data/nm_benchmark_nightly_configs_list.txt @@ -41,7 +41,7 @@ jobs: test_label_solo: gcp-k8s-l4-solo test_label_multi: ignore test_timeout: 720 - test_skip_list: neuralmagic/tests/skip-for-release.txt + test_skip_env_vars: neuralmagic/tests/test_skip_env_vars/full.txt benchmark_label: gcp-k8s-l4-solo benchmark_config_list_file: ./.github/data/nm_benchmark_nightly_configs_list.txt @@ -59,7 +59,7 @@ jobs: test_label_solo: gcp-k8s-l4-solo test_label_multi: ignore test_timeout: 720 - test_skip_list: neuralmagic/tests/skip-for-release.txt + test_skip_env_vars: neuralmagic/tests/test_skip_env_vars/full.txt benchmark_label: gcp-k8s-l4-solo benchmark_config_list_file: ./.github/data/nm_benchmark_nightly_configs_list.txt @@ -77,7 +77,7 @@ jobs: test_label_solo: gcp-k8s-l4-solo test_label_multi: ignore test_timeout: 720 - test_skip_list: neuralmagic/tests/skip-for-release.txt + test_skip_env_vars: neuralmagic/tests/test_skip_env_vars/full.txt benchmark_label: gcp-k8s-l4-solo benchmark_config_list_file: ./.github/data/nm_benchmark_nightly_configs_list.txt diff --git a/.github/workflows/nm-remote-push.yml b/.github/workflows/nm-remote-push.yml index d61618b482f84..3c1fe246756a4 100644 --- a/.github/workflows/nm-remote-push.yml +++ b/.github/workflows/nm-remote-push.yml @@ -12,37 +12,37 @@ concurrency: jobs: -# BUILD-TEST-3-8: -# uses: ./.github/workflows/nm-build-test.yml -# with: -# python: 3.8.17 -# gitref: ${{ github.ref }} -# -# test_label_solo: gcp-k8s-l4-solo -# test_label_multi: ignore -# test_timeout: 480 -# test_skip_list: neuralmagic/tests/skip-for-remote-push-tmp.txt -# -# benchmark_label: gcp-k8s-l4-solo -# benchmark_config_list_file: ./.github/data/nm_benchmark_remote_push_configs_list.txt -# benchmark_timeout: 480 -# secrets: inherit -# -# BUILD-TEST-3-9: -# uses: ./.github/workflows/nm-build-test.yml -# with: -# python: 3.9.17 -# gitref: ${{ github.ref }} -# -# test_label_solo: gcp-k8s-l4-solo -# test_label_multi: ignore -# test_timeout: 480 -# test_skip_list: neuralmagic/tests/skip-for-remote-push-tmp.txt -# -# benchmark_label: gcp-k8s-l4-solo -# benchmark_config_list_file: ./.github/data/nm_benchmark_remote_push_configs_list.txt -# benchmark_timeout: 480 -# secrets: inherit + BUILD-TEST-3-8: + uses: ./.github/workflows/nm-build-test.yml + with: + python: 3.8.17 + gitref: ${{ github.ref }} + + test_label_solo: gcp-k8s-l4-solo + test_label_multi: ignore + test_timeout: 480 + test_skip_env_vars: neuralmagic/tests/test_skip_env_vars/smoke.txt + + benchmark_label: gcp-k8s-l4-solo + benchmark_config_list_file: ./.github/data/nm_benchmark_remote_push_configs_list.txt + benchmark_timeout: 480 + secrets: inherit + + BUILD-TEST-3-9: + uses: ./.github/workflows/nm-build-test.yml + with: + python: 3.9.17 + gitref: ${{ github.ref }} + + test_label_solo: gcp-k8s-l4-solo + test_label_multi: ignore + test_timeout: 480 + test_skip_env_vars: neuralmagic/tests/test_skip_env_vars/smoke.txt + + benchmark_label: gcp-k8s-l4-solo + benchmark_config_list_file: ./.github/data/nm_benchmark_remote_push_configs_list.txt + 
benchmark_timeout: 480 + secrets: inherit BUILD-TEST-3-10: uses: ./.github/workflows/nm-build-test.yml @@ -51,27 +51,27 @@ jobs: gitref: ${{ github.ref }} test_label_solo: gcp-k8s-l4-solo - test_label_multi: gcp-k8s-l4-duo - test_timeout: 1440 - test_skip_list: neuralmagic/tests/skip-for-remote-push-tmp.txt + test_label_multi: ignore + test_timeout: 480 + test_skip_env_vars: neuralmagic/tests/test_skip_env_vars/smoke.txt benchmark_label: gcp-k8s-l4-solo benchmark_config_list_file: ./.github/data/nm_benchmark_remote_push_configs_list.txt benchmark_timeout: 480 secrets: inherit -# BUILD-TEST-3-11: -# uses: ./.github/workflows/nm-build-test.yml -# with: -# python: 3.11.4 -# gitref: ${{ github.ref }} -# -# test_label_solo: gcp-k8s-l4-solo -# test_label_multi: ignore -# test_timeout: 480 -# test_skip_list: neuralmagic/tests/skip-for-remote-push-tmp.txt -# -# benchmark_label: gcp-k8s-l4-solo -# benchmark_config_list_file: ./.github/data/nm_benchmark_remote_push_configs_list.txt -# benchmark_timeout: 480 -# secrets: inherit + BUILD-TEST-3-11: + uses: ./.github/workflows/nm-build-test.yml + with: + python: 3.11.4 + gitref: ${{ github.ref }} + + test_label_solo: gcp-k8s-l4-solo + test_label_multi: ignore + test_timeout: 480 + test_skip_env_vars: neuralmagic/tests/test_skip_env_vars/smoke.txt + + benchmark_label: gcp-k8s-l4-solo + benchmark_config_list_file: ./.github/data/nm_benchmark_remote_push_configs_list.txt + benchmark_timeout: 480 + secrets: inherit diff --git a/.github/workflows/nm-test.yml b/.github/workflows/nm-test.yml index 3e3f3adef3ef3..87860bcc356bb 100644 --- a/.github/workflows/nm-test.yml +++ b/.github/workflows/nm-test.yml @@ -23,8 +23,8 @@ on: description: "whl to test (variable appears late binding so unusable outside 'download artifact')" type: string required: true - test_skip_list: - description: 'file containing tests to skip' + test_skip_env_vars: + description: 'file containing tests env vars for test skipping' type: string required: true @@ -51,8 +51,8 @@ on: description: "whl to test (variable appears late binding so unusable outside 'download artifact')" type: string required: true - test_skip_list: - description: 'file containing tests to skip' + test_skip_env_vars: + description: 'file containing tests env vars for test skipping' type: string required: true @@ -131,12 +131,17 @@ jobs: - name: run buildkite script run: | cd tests && sudo bash ../.buildkite/download-images.sh + + - name: setenv test skip + id: setenv_test_skip + uses: ./.github/actions/nm-set-env-test-skip + with: + test_skip_env_vars: ${{ inputs.test_skip_env_vars }} - name: run tests id: test uses: ./.github/actions/nm-test-whl/ with: - test_skip_list: ${{ inputs.test_skip_list }} test_directory: tests test_results: test-results diff --git a/.github/workflows/nm-weekly.yml b/.github/workflows/nm-weekly.yml index c385e0c6d8510..d92a2619ef359 100644 --- a/.github/workflows/nm-weekly.yml +++ b/.github/workflows/nm-weekly.yml @@ -27,7 +27,7 @@ jobs: test_label_solo: aws-avx2-32G-a10g-24G test_label_multi: aws-avx2-192G-4-a10g-96G test_timeout: 480 - test_skip_list: neuralmagic/tests/skip-for-weekly.txt + test_skip_env_vars: neuralmagic/tests/test_skip_env_vars/full.txt benchmark_label: aws-avx2-32G-a10g-24G benchmark_config_list_file: ./.github/data/nm_benchmark_weekly_configs_list.txt diff --git a/neuralmagic/tests/test_skip_env_vars/full.txt b/neuralmagic/tests/test_skip_env_vars/full.txt new file mode 100644 index 0000000000000..9c6f69cacd225 --- /dev/null +++ 
b/neuralmagic/tests/test_skip_env_vars/full.txt @@ -0,0 +1,19 @@ +TEST_ACCURACY=DISABLE +TEST_ASYNC_ENGINE=ENABLE +TEST_BASIC_CORRECTNESS=ENABLE +TEST_CORE=ENABLE +TEST_DISTRIBUTED=DISABLE +TEST_ENGINE=ENABLE +TEST_ENTRYPOINTS=ENABLE +TEST_KERNELS=ENABLE +TEST_LORA=ENABLE +TEST_METRICS=ENABLE +TEST_MODELS=ENABLE +TEST_MODELS_CORE=ENABLE +TEST_PREFIX_CACHING=ENABLE +TEST_QUANTIZATION=ENABLE +TEST_SAMPLERS=ENABLE +TEST_SPEC_DECODE=DISABLE +TEST_TENSORIZER_LOADER=ENABLE +TEST_TOKENIZATION=ENABLE +TEST_WORKER=ENABLE diff --git a/neuralmagic/tests/test_skip_env_vars/smoke.txt b/neuralmagic/tests/test_skip_env_vars/smoke.txt new file mode 100644 index 0000000000000..5c5066aaee391 --- /dev/null +++ b/neuralmagic/tests/test_skip_env_vars/smoke.txt @@ -0,0 +1,19 @@ +TEST_ACCURACY=DISABLE +TEST_ASYNC_ENGINE=ENABLE +TEST_BASIC_CORRECTNESS=DISABLE +TEST_CORE=ENABLE +TEST_DISTRIBUTED=DISABLE +TEST_ENGINE=ENABLE +TEST_ENTRYPOINTS=DISABLE +TEST_KERNELS=DISABLE +TEST_LORA=DISABLE +TEST_METRICS=ENABLE +TEST_MODELS=DISABLE +TEST_MODELS_CORE=ENABLE +TEST_PREFIX_CACHING=ENABLE +TEST_QUANTIZATION=ENABLE +TEST_SAMPLERS=DISABLE +TEST_SPEC_DECODE=DISABLE +TEST_TENSORIZER_LOADER=DISABLE +TEST_TOKENIZATION=ENABLE +TEST_WORKER=ENABLE diff --git a/requirements-dev.txt b/requirements-dev.txt index 837ed9d495e10..587387a3d582a 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -31,6 +31,8 @@ peft requests==2.31 ray sentence-transformers # required for embedding +optimum # required for hf gptq baselines +auto-gptq # required for hf gptq baselines # Benchmarking aiohttp diff --git a/tests/accuracy/test_lm_eval_correctness.py b/tests/accuracy/test_lm_eval_correctness.py index ded6d98d6f6ad..4c1ac9638a10a 100644 --- a/tests/accuracy/test_lm_eval_correctness.py +++ b/tests/accuracy/test_lm_eval_correctness.py @@ -8,6 +8,11 @@ import yaml from tests.nm_utils.server import ServerContext +from tests.nm_utils.utils_skip import should_skip_test_group + +if should_skip_test_group(group_name="TEST_ACCURACY"): + pytest.skip("TEST_ACCURACY=DISABLE, skipping accuracy test group", + allow_module_level=True) if TYPE_CHECKING: import lm_eval as lm_eval_t diff --git a/tests/async_engine/test_api_server.py b/tests/async_engine/test_api_server.py index 7f57d5cf9b182..e2cddf228cce7 100644 --- a/tests/async_engine/test_api_server.py +++ b/tests/async_engine/test_api_server.py @@ -7,6 +7,12 @@ import pytest import requests +from tests.nm_utils.utils_skip import should_skip_test_group + +if should_skip_test_group(group_name="TEST_ASYNC_ENGINE"): + pytest.skip("TEST_ASYNC_ENGINE=DISABLE, skipping async engine test group", + allow_module_level=True) + def _query_server(prompt: str, max_tokens: int = 5) -> dict: response = requests.post("http://localhost:8000/generate", diff --git a/tests/async_engine/test_async_llm_engine.py b/tests/async_engine/test_async_llm_engine.py index 10a46422887e3..77801437e7581 100644 --- a/tests/async_engine/test_async_llm_engine.py +++ b/tests/async_engine/test_async_llm_engine.py @@ -3,8 +3,13 @@ import pytest +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.engine.async_llm_engine import AsyncLLMEngine +if should_skip_test_group(group_name="TEST_ASYNC_ENGINE"): + pytest.skip("TEST_ASYNC_ENGINE=DISABLE, skipping async engine test group", + allow_module_level=True) + @dataclass class RequestOutput: diff --git a/tests/async_engine/test_chat_template.py b/tests/async_engine/test_chat_template.py index 55b730812ea94..5e21ed2061a89 100644 --- a/tests/async_engine/test_chat_template.py 
+++ b/tests/async_engine/test_chat_template.py @@ -4,10 +4,15 @@ import pytest +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.entrypoints.openai.protocol import ChatCompletionRequest from vllm.entrypoints.openai.serving_chat import OpenAIServingChat from vllm.transformers_utils.tokenizer import get_tokenizer +if should_skip_test_group(group_name="TEST_ASYNC_ENGINE"): + pytest.skip("TEST_ASYNC_ENGINE=DISABLE, skipping async engine test group", + allow_module_level=True) + chatml_jinja_path = pathlib.Path(os.path.dirname(os.path.abspath( __file__))).parent.parent / "examples/template_chatml.jinja" assert chatml_jinja_path.exists() diff --git a/tests/async_engine/test_openapi_server_ray.py b/tests/async_engine/test_openapi_server_ray.py index 35d8808b7a699..60c45388c53a8 100644 --- a/tests/async_engine/test_openapi_server_ray.py +++ b/tests/async_engine/test_openapi_server_ray.py @@ -4,8 +4,13 @@ # and debugging. import ray +from tests.nm_utils.utils_skip import should_skip_test_group from tests.utils import ServerRunner +if should_skip_test_group(group_name="TEST_ASYNC_ENGINE"): + pytest.skip("TEST_ASYNC_ENGINE=DISABLE, skipping async engine test group", + allow_module_level=True) + # any model with a chat template should work here MODEL_NAME = "facebook/opt-125m" diff --git a/tests/async_engine/test_request_tracker.py b/tests/async_engine/test_request_tracker.py index 7b1f4a9e1eb2f..d217db1ba7068 100644 --- a/tests/async_engine/test_request_tracker.py +++ b/tests/async_engine/test_request_tracker.py @@ -1,8 +1,13 @@ import pytest +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.engine.async_llm_engine import RequestTracker from vllm.outputs import RequestOutput +if should_skip_test_group(group_name="TEST_ASYNC_ENGINE"): + pytest.skip("TEST_ASYNC_ENGINE=DISABLE, skipping async engine test group", + allow_module_level=True) + @pytest.mark.asyncio async def test_request_tracker(): diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py index 805b8883b9d94..fadc4998b4091 100644 --- a/tests/basic_correctness/test_basic_correctness.py +++ b/tests/basic_correctness/test_basic_correctness.py @@ -7,8 +7,14 @@ import pytest +from tests.nm_utils.utils_skip import should_skip_test_group from vllm import LLM +if should_skip_test_group(group_name="TEST_BASIC_CORRECTNESS"): + pytest.skip( + "TEST_BASIC_CORRECTNESS=DISABLE, skipping basic correctness test group", + allow_module_level=True) + MODELS = [ "facebook/opt-125m", "meta-llama/Llama-2-7b-hf", diff --git a/tests/basic_correctness/test_basic_server_correctness.py b/tests/basic_correctness/test_basic_server_correctness.py deleted file mode 100644 index c33d0aa46c8f2..0000000000000 --- a/tests/basic_correctness/test_basic_server_correctness.py +++ /dev/null @@ -1,188 +0,0 @@ -import asyncio -from os import getenv -from typing import Dict, List, Type - -import openai -import pytest -import torch -from datasets import load_dataset -from openai import AsyncOpenAI -from transformers import AutoTokenizer - -from tests.conftest import HfRunnerNM -from tests.models.compare_utils import check_logprobs_close -from tests.nm_utils.logging import make_logger -from tests.nm_utils.server import ServerContext -from vllm.model_executor.layers.quantization import get_quantization_config - - -@pytest.fixture(scope="session") -def client(): - client = openai.AsyncOpenAI( - base_url="http://localhost:8000/v1", - api_key="token-abc123", - ) - yield client - - 
-@pytest.fixture -def hf_runner_nm() -> Type[HfRunnerNM]: - return HfRunnerNM - - -async def my_chat( - client, - model: str, - messages: List[Dict], - max_tokens: int, - temperature: float, - num_logprobs: int, -): - """ submit a single prompt chat and collect results. """ - return await client.chat.completions.create(model=model, - messages=messages, - max_tokens=max_tokens, - temperature=temperature, - logprobs=True, - top_logprobs=num_logprobs) - - -@pytest.mark.parametrize("model, max_model_len, sparsity, gptq_config", [ - ("mistralai/Mistral-7B-Instruct-v0.2", 4096, None, None), - ("neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50", 4096, "sparse_w16a16", - None), - ("NousResearch/Llama-2-7b-chat-hf", 4096, None, None), - ("neuralmagic/Llama-2-7b-pruned70-retrained-ultrachat", 4096, - "sparse_w16a16", None), - ("microsoft/phi-2", 2048, None, None), - ("google/gemma-1.1-2b-it", 2056, None, None), - ("HuggingFaceH4/zephyr-7b-gemma-v0.1", 4096, None, None), -]) -@pytest.mark.parametrize("max_tokens", [32]) -@pytest.mark.parametrize("num_logprobs", [5]) -@pytest.mark.parametrize("tensor_parallel_size", [None]) -# note: repeating the test for 2 values of tensor_parallel_size -# increases the overall execution time by unnecessarily -# collecting the HuggingFace runner data twice. -# Consider refactoring to eliminate that repeat. -def test_models_on_server( - hf_runner_nm: HfRunnerNM, - client: AsyncOpenAI, - model: str, - max_model_len: int, - sparsity: str, - gptq_config: str, - tensor_parallel_size: int, - max_tokens: int, - num_logprobs: int, -) -> None: - """ - This test compares the output of the vllm OpenAI server against that of - a HuggingFace transformer. We expect them to be fairly close. "Close" - is measured by checking that the top 3 logprobs for each token includes - the token of the other inference tool. The first time that there is no - exact match, as long as there is a match to one of the top `num_logprobs` - logprobs, the test will not proceed further, but will pass. - - Parameters to the test identify a model to test, and key arguments - required for that model (see the `max_model_len`, `sparsity` and - `gptq_config` params below). The additional parametrizations expand test - coverage across the functional space of the server. - - :param hf_runner_nm: fixture for the HfRunnerNM - :param client: fixture with an openai.AsyncOpenAI client - :param model: The Hugginface id for a model to test with - :param max_model_len: passed to the vllm Server's --max-model-len option - :param sparsity: passed to the vllm Server's --sparsity option - :param gptq_config: quantization method id for this model. default None - means quantization isn't involved. - :param tensor_parallel_size: passed to the vllm Server's - --tensor_parallel_size option - :param max_tokens: the total number of tokens to consider for closeness - :param num_logprobs: the total number of logprobs included when - calculating closeness - """ - logger = make_logger("vllm_test") - # check that the requested gpu count is available in the test env - gpu_count = torch.cuda.device_count() - if tensor_parallel_size and gpu_count < tensor_parallel_size: - pytest.skip(f"gpu count {gpu_count} is insufficient for " - f"tensor_parallel_size = {tensor_parallel_size}") - - # skip this model if the current device does not have the required - # gpu capability. 
- device_capability = torch.cuda.get_device_capability() - capability = device_capability[0] * 10 + device_capability[1] - if gptq_config and ( - capability < - get_quantization_config(gptq_config).get_min_capability()): - pytest.skip("insufficient system GPU device capability " - f"({capability}) for this model") - - hf_token = getenv("HF_TOKEN", None) - logger.info("loading chat prompts for testing.") - ds = load_dataset("nm-testing/qa-chat-prompts", split="train_sft") - ds = ds.select(range(20)) - - num_chat_turns = 3 - messages_list = [row["messages"][:num_chat_turns] for row in ds] - tokenizer = AutoTokenizer.from_pretrained(model) - chat_prompts = [ - tokenizer.apply_chat_template(messages, - tokenize=False, - add_generation_prompt=True) - for messages in messages_list - ] - - logger.info("generating chat responses from HuggingFace runner.") - hf_model = hf_runner_nm(model, access_token=hf_token) - hf_outputs = hf_model.generate_greedy_logprobs_nm_use_tokens( - chat_prompts, max_tokens, num_logprobs, ignore_special_tokens=True) - - del hf_model - - logger.info("generating chat responses from vllm server.") - api_server_args = { - "--model": model, - "--max-model-len": max_model_len, - "--disable-log-requests": None, - } - if sparsity: - api_server_args["--sparsity"] = sparsity - if tensor_parallel_size: - api_server_args["--tensor-parallel-size"] = tensor_parallel_size - - # some devices will require a different `dtype` - if device_capability[0] < 8: - api_server_args["--dtype"] = "half" - - asyncio_event_loop = asyncio.get_event_loop() - temperature = 0.0 - with ServerContext(api_server_args, logger=logger) as _: - # submit an asynchronous request to the server for each prompt - chats = [ - my_chat(client, model, messages, max_tokens, temperature, - num_logprobs) - for messages in [query for query in messages_list] - ] - # await for all the requests to return, and gather their results - # in one place - results = asyncio_event_loop.run_until_complete(asyncio.gather(*chats)) - - logger.info("preparing results from vllm server requests to include " - "tokens and logprobs.") - vllm_outputs = list() - for task_result in results: - for req_output in task_result.choices: - output_str = req_output.message.content - output_tokens = req_output.logprobs.model_extra["tokens"] - output_logprobs = req_output.logprobs.model_extra["top_logprobs"] - vllm_outputs.append((output_tokens, output_str, output_logprobs)) - - logger.info("comparing HuggingFace and vllm Server chat responses") - check_logprobs_close( - outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_outputs, - name_0="hf_model", - name_1="vllm_model", - ) diff --git a/tests/basic_correctness/test_chunked_prefill.py b/tests/basic_correctness/test_chunked_prefill.py index 357bff61ef019..8cb033edd25a3 100644 --- a/tests/basic_correctness/test_chunked_prefill.py +++ b/tests/basic_correctness/test_chunked_prefill.py @@ -8,6 +8,13 @@ """ import pytest +from tests.nm_utils.utils_skip import should_skip_test_group + +if should_skip_test_group(group_name="TEST_BASIC_CORRECTNESS"): + pytest.skip( + "TEST_BASIC_CORRECTNESS=DISABLE, skipping basic correctness test group", + allow_module_level=True) + MODELS = [ "facebook/opt-125m", "meta-llama/Llama-2-7b-hf", diff --git a/tests/basic_correctness/test_preemption.py b/tests/basic_correctness/test_preemption.py index 7f20b2d934942..f16c1fb48b833 100644 --- a/tests/basic_correctness/test_preemption.py +++ b/tests/basic_correctness/test_preemption.py @@ -8,10 +8,16 @@ import pytest from prometheus_client 
import REGISTRY +from tests.nm_utils.utils_skip import should_skip_test_group from vllm import SamplingParams from vllm.core.scheduler import (ARTIFICIAL_PREEMPTION_MAX_CNT, ENABLE_ARTIFICIAL_PREEMPT) +if should_skip_test_group(group_name="TEST_BASIC_CORRECTNESS"): + pytest.skip( + "TEST_BASIC_CORRECTNESS=DISABLE, skipping basic correctness test group", + allow_module_level=True) + MODELS = [ "facebook/opt-125m", ] diff --git a/tests/conftest.py b/tests/conftest.py index 48c7f8c095f0e..31c6f12f5c030 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -148,6 +148,7 @@ def __init__( *, is_embedding_model: bool = False, is_vision_model: bool = False, + **kwargs, ) -> None: assert dtype in _STR_DTYPE_TO_TORCH_DTYPE torch_dtype = _STR_DTYPE_TO_TORCH_DTYPE[dtype] @@ -173,6 +174,7 @@ def __init__( model_name, torch_dtype=torch_dtype, trust_remote_code=True, + **kwargs, )) self.tokenizer = AutoTokenizer.from_pretrained( @@ -473,21 +475,21 @@ def _decode_token_by_position_index( def generate_greedy_logprobs_nm_use_tokens( self, - prompts: List[str], + input_ids_lst: List[torch.Tensor], max_tokens: int, topk_logprobs_count: int, - ignore_special_tokens: bool = False ) -> List[Tuple[List[int], str, List[Dict]]]: all_logprobs = [] all_output_tokens = [] all_output_strs = [] - for prompt in prompts: - input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids + for input_ids in input_ids_lst: output = self.model.generate( - input_ids.cuda(), - use_cache=True, + input_ids, do_sample=False, + temperature=None, # Explicitly set to avoid warning + top_p=None, # Explicitly set to avoid warning + top_k=None, # Explicitly set to avoid warning max_new_tokens=max_tokens, output_hidden_states=True, return_dict_in_generate=True, diff --git a/tests/core/block/e2e/test_correctness.py b/tests/core/block/e2e/test_correctness.py index ad253635e0ba0..604aba39c560d 100644 --- a/tests/core/block/e2e/test_correctness.py +++ b/tests/core/block/e2e/test_correctness.py @@ -2,10 +2,15 @@ import pytest +from tests.nm_utils.utils_skip import should_skip_test_group from vllm import SamplingParams from .conftest import get_token_ids_from_llm_generator +if should_skip_test_group(group_name="TEST_CORE"): + pytest.skip("TEST_CORE=DISABLE, skipping core test group", + allow_module_level=True) + @pytest.mark.parametrize( "common_llm_kwargs", diff --git a/tests/core/block/e2e/test_correctness_sliding_window.py b/tests/core/block/e2e/test_correctness_sliding_window.py index d77d6a1dbb741..37e9af1116c9d 100644 --- a/tests/core/block/e2e/test_correctness_sliding_window.py +++ b/tests/core/block/e2e/test_correctness_sliding_window.py @@ -3,10 +3,15 @@ import pytest +from tests.nm_utils.utils_skip import should_skip_test_group from vllm import LLM, SamplingParams from .conftest import get_text_from_llm_generator +if should_skip_test_group(group_name="TEST_CORE"): + pytest.skip("TEST_CORE=DISABLE, skipping core test group", + allow_module_level=True) + # relatively small model with 4k sliding window MODEL = "bigcode/starcoder2-3b" BLOCK_SIZE = 16 diff --git a/tests/core/block/test_block_manager_v2.py b/tests/core/block/test_block_manager_v2.py index d0ca09c4be0d4..aa059c45323bf 100644 --- a/tests/core/block/test_block_manager_v2.py +++ b/tests/core/block/test_block_manager_v2.py @@ -1,5 +1,6 @@ import pytest +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.core.block.utils import (STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE, STR_NOT_IMPL_ENC_DEC_SWA) from vllm.core.block_manager_v2 import BlockSpaceManagerV2 @@ 
-10,6 +11,10 @@ from ..utils import (create_dummy_prompt, create_seq_group, create_seq_group_encoder_decoder) +if should_skip_test_group(group_name="TEST_CORE"): + pytest.skip("TEST_CORE=DISABLE, skipping core test group", + allow_module_level=True) + @pytest.mark.parametrize("block_size", [16]) @pytest.mark.parametrize("num_gpu_blocks", [8, 40, 80]) diff --git a/tests/core/block/test_block_table.py b/tests/core/block/test_block_table.py index 6fb95cfdfab81..2a1c9945b93dc 100644 --- a/tests/core/block/test_block_table.py +++ b/tests/core/block/test_block_table.py @@ -1,9 +1,14 @@ import pytest +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.core.block.block_table import BlockTable from vllm.core.block.cpu_gpu_block_allocator import CpuGpuBlockAllocator from vllm.utils import Device, cdiv, chunk_list +if should_skip_test_group(group_name="TEST_CORE"): + pytest.skip("TEST_CORE=DISABLE, skipping core test group", + allow_module_level=True) + @pytest.mark.parametrize("block_size", [16]) @pytest.mark.parametrize("sequence_len", [1, 16, 129]) diff --git a/tests/core/block/test_common.py b/tests/core/block/test_common.py index cfdd3582ed2ef..957465a2b56dc 100644 --- a/tests/core/block/test_common.py +++ b/tests/core/block/test_common.py @@ -2,8 +2,13 @@ import pytest +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.core.block.common import RefCounter +if should_skip_test_group(group_name="TEST_CORE"): + pytest.skip("TEST_CORE=DISABLE, skipping core test group", + allow_module_level=True) + @pytest.mark.parametrize("seed", list(range(20))) @pytest.mark.parametrize("num_incrs", [1, 100]) diff --git a/tests/core/block/test_cpu_gpu_block_allocator.py b/tests/core/block/test_cpu_gpu_block_allocator.py index 44a5be6c181a0..a70310906e2f1 100644 --- a/tests/core/block/test_cpu_gpu_block_allocator.py +++ b/tests/core/block/test_cpu_gpu_block_allocator.py @@ -1,8 +1,13 @@ import pytest +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.core.block.cpu_gpu_block_allocator import CpuGpuBlockAllocator from vllm.utils import Device, chunk_list +if should_skip_test_group(group_name="TEST_CORE"): + pytest.skip("TEST_CORE=DISABLE, skipping core test group", + allow_module_level=True) + @pytest.mark.parametrize("num_cpu_blocks", [0, 512]) @pytest.mark.parametrize("num_gpu_blocks", [1024]) diff --git a/tests/core/block/test_naive_block.py b/tests/core/block/test_naive_block.py index edcdc0c7d4f98..4e619ee433f85 100644 --- a/tests/core/block/test_naive_block.py +++ b/tests/core/block/test_naive_block.py @@ -2,9 +2,14 @@ import pytest +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.core.block.interfaces import Block, BlockAllocator from vllm.core.block.naive_block import NaiveBlock, NaiveBlockAllocator +if should_skip_test_group(group_name="TEST_CORE"): + pytest.skip("TEST_CORE=DISABLE, skipping core test group", + allow_module_level=True) + class TestNaiveBlockAllocator: diff --git a/tests/core/block/test_prefix_caching_block.py b/tests/core/block/test_prefix_caching_block.py index bcf08cda09f46..c300345dd7da6 100644 --- a/tests/core/block/test_prefix_caching_block.py +++ b/tests/core/block/test_prefix_caching_block.py @@ -5,10 +5,15 @@ import pytest +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.core.block.interfaces import Block, BlockAllocator from vllm.core.block.prefix_caching_block import (PrefixCachingBlock, PrefixCachingBlockAllocator) +if should_skip_test_group(group_name="TEST_CORE"): + 
pytest.skip("TEST_CORE=DISABLE, skipping core test group", + allow_module_level=True) + class TestPrefixCachingBlock: diff --git a/tests/core/test_block_manager.py b/tests/core/test_block_manager.py index cd306b9e4d3cc..17c7f91f01eed 100644 --- a/tests/core/test_block_manager.py +++ b/tests/core/test_block_manager.py @@ -4,6 +4,7 @@ import pytest +from tests.nm_utils.utils_skip import should_skip_test_group from vllm import SamplingParams from vllm.block import PhysicalTokenBlock from vllm.core.block.utils import (STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE, @@ -16,6 +17,10 @@ from .utils import create_dummy_prompt, create_dummy_prompt_encoder_decoder +if should_skip_test_group(group_name="TEST_CORE"): + pytest.skip("TEST_CORE=DISABLE, skipping core test group", + allow_module_level=True) + def test_block_allocator_allocate(): block_size = 4 diff --git a/tests/core/test_chunked_prefill_scheduler.py b/tests/core/test_chunked_prefill_scheduler.py index 3649e6b003a5d..8fce7b4364298 100644 --- a/tests/core/test_chunked_prefill_scheduler.py +++ b/tests/core/test_chunked_prefill_scheduler.py @@ -3,6 +3,7 @@ import pytest # noqa +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.config import CacheConfig, SchedulerConfig from vllm.core.interfaces import AllocStatus from vllm.core.scheduler import Scheduler @@ -10,6 +11,10 @@ from .utils import create_dummy_prompt +if should_skip_test_group(group_name="TEST_CORE"): + pytest.skip("TEST_CORE=DISABLE, skipping core test group", + allow_module_level=True) + def get_sequence_groups(scheduler_output): return [s.seq_group for s in scheduler_output.scheduled_seq_groups] diff --git a/tests/core/test_scheduler.py b/tests/core/test_scheduler.py index 07fc8731e1847..b7960435a6d69 100644 --- a/tests/core/test_scheduler.py +++ b/tests/core/test_scheduler.py @@ -5,6 +5,7 @@ import pytest # noqa +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig from vllm.core.interfaces import AllocStatus from vllm.core.policy import PolicyFactory @@ -14,6 +15,10 @@ from .utils import create_dummy_prompt +if should_skip_test_group(group_name="TEST_CORE"): + pytest.skip("TEST_CORE=DISABLE, skipping core test group", + allow_module_level=True) + def get_sequence_groups(scheduler_output): return [s.seq_group for s in scheduler_output.scheduled_seq_groups] diff --git a/tests/distributed/test_basic_distributed_correctness.py b/tests/distributed/test_basic_distributed_correctness.py index b0576e20e9e1c..aeb4057ebe34e 100644 --- a/tests/distributed/test_basic_distributed_correctness.py +++ b/tests/distributed/test_basic_distributed_correctness.py @@ -21,6 +21,12 @@ import pytest import torch +from tests.nm_utils.utils_skip import should_skip_test_group + +if should_skip_test_group(group_name="TEST_DISTRIBUTED"): + pytest.skip("TEST_DISTRIBUTED=DISABLE, skipping distributed test group", + allow_module_level=True) + MODELS = [ "meta-llama/Llama-2-7b-hf", ] diff --git a/tests/distributed/test_chunked_prefill_distributed.py b/tests/distributed/test_chunked_prefill_distributed.py index 204e79e26d513..c8d1147238f52 100644 --- a/tests/distributed/test_chunked_prefill_distributed.py +++ b/tests/distributed/test_chunked_prefill_distributed.py @@ -20,6 +20,12 @@ import pytest import torch +from tests.nm_utils.utils_skip import should_skip_test_group + +if should_skip_test_group(group_name="TEST_DISTRIBUTED"): + pytest.skip("TEST_DISTRIBUTED=DISABLE, skipping distributed test group", + allow_module_level=True) 
+ MODELS = [ "meta-llama/Llama-2-7b-hf", ] diff --git a/tests/distributed/test_comm_ops.py b/tests/distributed/test_comm_ops.py index 2b597bb63c031..894938e874092 100644 --- a/tests/distributed/test_comm_ops.py +++ b/tests/distributed/test_comm_ops.py @@ -8,12 +8,17 @@ import ray import torch +from tests.nm_utils.utils_skip import should_skip_test_group from tests.utils import (init_test_distributed_environment, multi_process_tensor_parallel) from vllm.distributed import (broadcast_tensor_dict, tensor_model_parallel_all_gather, tensor_model_parallel_all_reduce) +if should_skip_test_group(group_name="TEST_DISTRIBUTED"): + pytest.skip("TEST_DISTRIBUTED=DISABLE, skipping distributed test group", + allow_module_level=True) + @ray.remote(num_gpus=1, max_calls=1) def all_reduce_test_worker(tp_size: int, pp_size: int, rank: int, diff --git a/tests/distributed/test_custom_all_reduce.py b/tests/distributed/test_custom_all_reduce.py index e4bfda8425344..5f77ed7539979 100644 --- a/tests/distributed/test_custom_all_reduce.py +++ b/tests/distributed/test_custom_all_reduce.py @@ -6,6 +6,7 @@ import torch import torch.distributed as dist +from tests.nm_utils.utils_skip import should_skip_test_group from tests.utils import (init_test_distributed_environment, multi_process_tensor_parallel) from vllm.distributed.communication_op import ( # noqa @@ -13,6 +14,10 @@ from vllm.distributed.parallel_state import (get_tensor_model_parallel_group, get_tp_ca_communicator) +if should_skip_test_group(group_name="TEST_DISTRIBUTED"): + pytest.skip("TEST_DISTRIBUTED=DISABLE, skipping distributed test group", + allow_module_level=True) + random.seed(42) test_sizes = [random.randint(1024, 2048 * 1024) for _ in range(8)] for i, v in enumerate(test_sizes): diff --git a/tests/distributed/test_pynccl.py b/tests/distributed/test_pynccl.py index 0218295a3e3f9..b5dca7f8a82fd 100644 --- a/tests/distributed/test_pynccl.py +++ b/tests/distributed/test_pynccl.py @@ -5,6 +5,7 @@ import torch import torch.distributed +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.distributed.communication_op import ( # noqa graph_capture, tensor_model_parallel_all_reduce) from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator @@ -13,6 +14,10 @@ init_distributed_environment) from vllm.utils import update_environment_variables +if should_skip_test_group(group_name="TEST_DISTRIBUTED"): + pytest.skip("TEST_DISTRIBUTED=DISABLE, skipping distributed test group", + allow_module_level=True) + def distributed_run(fn, world_size): number_of_processes = world_size diff --git a/tests/engine/output_processor/test_multi_step.py b/tests/engine/output_processor/test_multi_step.py index 4f32a622546f0..99189af1b0076 100644 --- a/tests/engine/output_processor/test_multi_step.py +++ b/tests/engine/output_processor/test_multi_step.py @@ -4,6 +4,7 @@ import pytest from transformers import PreTrainedTokenizer +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.core.scheduler import Scheduler from vllm.engine.output_processor.multi_step import MultiStepOutputProcessor from vllm.engine.output_processor.stop_checker import StopChecker @@ -15,6 +16,10 @@ from ...core.utils import create_seq_group +if should_skip_test_group(group_name="TEST_ENGINE"): + pytest.skip("TEST_ENGINE=DISABLE, skipping engine test group", + allow_module_level=True) + @pytest.mark.parametrize("seq_output_len", [128]) @pytest.mark.parametrize("num_new_tokens", [1, 12]) diff --git a/tests/engine/output_processor/test_stop_checker.py 
b/tests/engine/output_processor/test_stop_checker.py index f795403e3d8ad..e6af7a3257d44 100644 --- a/tests/engine/output_processor/test_stop_checker.py +++ b/tests/engine/output_processor/test_stop_checker.py @@ -3,10 +3,15 @@ import pytest from transformers import PreTrainedTokenizer +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.engine.output_processor.stop_checker import StopChecker from vllm.sampling_params import SamplingParams from vllm.sequence import Logprob, Sequence, SequenceStatus +if should_skip_test_group(group_name="TEST_ENGINE"): + pytest.skip("TEST_ENGINE=DISABLE, skipping engine test group", + allow_module_level=True) + def sequence_with_eos(text: str, eos_token: str, eos_token_id: int) -> Sequence: diff --git a/tests/engine/test_computed_prefix_blocks.py b/tests/engine/test_computed_prefix_blocks.py index ed35212cc3f11..1f2c7fd14c4cc 100644 --- a/tests/engine/test_computed_prefix_blocks.py +++ b/tests/engine/test_computed_prefix_blocks.py @@ -1,9 +1,14 @@ import pytest +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.engine.arg_utils import EngineArgs from vllm.engine.llm_engine import LLMEngine from vllm.sampling_params import SamplingParams +if should_skip_test_group(group_name="TEST_ENGINE"): + pytest.skip("TEST_ENGINE=DISABLE, skipping engine test group", + allow_module_level=True) + @pytest.mark.parametrize("model", ["facebook/opt-125m"]) @pytest.mark.parametrize("block_size", [16]) diff --git a/tests/engine/test_detokenization.py b/tests/engine/test_detokenization.py index f77f6d0725b6b..ae1d5779d72c1 100644 --- a/tests/engine/test_detokenization.py +++ b/tests/engine/test_detokenization.py @@ -1,8 +1,13 @@ import pytest +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.entrypoints.llm import LLM from vllm.sampling_params import SamplingParams +if should_skip_test_group(group_name="TEST_ENGINE"): + pytest.skip("TEST_ENGINE=DISABLE, skipping engine test group", + allow_module_level=True) + @pytest.mark.parametrize("model", ["facebook/opt-125m"]) def test_computed_prefix_blocks(model: str): diff --git a/tests/engine/test_multiproc_workers.py b/tests/engine/test_multiproc_workers.py index 610ad9732fb91..8b3f68c843cc1 100644 --- a/tests/engine/test_multiproc_workers.py +++ b/tests/engine/test_multiproc_workers.py @@ -6,9 +6,14 @@ import pytest +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.executor.multiproc_worker_utils import (ProcessWorkerWrapper, ResultHandler, WorkerMonitor) +if should_skip_test_group(group_name="TEST_ENGINE"): + pytest.skip("TEST_ENGINE=DISABLE, skipping engine test group", + allow_module_level=True) + class DummyWorker: """Dummy version of vllm.worker.worker.Worker""" diff --git a/tests/engine/test_skip_tokenizer_init.py b/tests/engine/test_skip_tokenizer_init.py index 338b208723ba9..438ae0fc71477 100644 --- a/tests/engine/test_skip_tokenizer_init.py +++ b/tests/engine/test_skip_tokenizer_init.py @@ -1,8 +1,13 @@ import pytest +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.entrypoints.llm import LLM from vllm.sampling_params import SamplingParams +if should_skip_test_group(group_name="TEST_ENGINE"): + pytest.skip("TEST_ENGINE=DISABLE, skipping engine test group", + allow_module_level=True) + @pytest.mark.parametrize("model", ["facebook/opt-125m"]) def test_skip_tokenizer_initialization(model: str): diff --git a/tests/engine/test_stop_reason.py b/tests/engine/test_stop_reason.py index b0bd6c4aa95d3..0a95f2cb59e4d 100644 --- 
a/tests/engine/test_stop_reason.py +++ b/tests/engine/test_stop_reason.py @@ -9,8 +9,13 @@ import pytest import transformers +from tests.nm_utils.utils_skip import should_skip_test_group from vllm import SamplingParams +if should_skip_test_group(group_name="TEST_ENGINE"): + pytest.skip("TEST_ENGINE=DISABLE, skipping engine test group", + allow_module_level=True) + MODEL = "facebook/opt-350m" STOP_STR = "." SEED = 42 diff --git a/tests/engine/test_stop_strings.py b/tests/engine/test_stop_strings.py index 1584b85aeb064..8a68dceac136c 100644 --- a/tests/engine/test_stop_strings.py +++ b/tests/engine/test_stop_strings.py @@ -2,8 +2,13 @@ import pytest +from tests.nm_utils.utils_skip import should_skip_test_group from vllm import CompletionOutput, LLMEngine, SamplingParams +if should_skip_test_group(group_name="TEST_ENGINE"): + pytest.skip("TEST_ENGINE=DISABLE, skipping engine test group", + allow_module_level=True) + MODEL = "meta-llama/llama-2-7b-hf" MAX_TOKENS = 200 diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py index c45f02fe564a3..e3168f67e001f 100644 --- a/tests/entrypoints/openai/test_serving_chat.py +++ b/tests/entrypoints/openai/test_serving_chat.py @@ -3,8 +3,13 @@ import pytest +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.entrypoints.openai.serving_chat import OpenAIServingChat +if should_skip_test_group(group_name="TEST_ENTRYPOINTS"): + pytest.skip("TEST_ENTRYPOINTS=DISABLE, skipping entrypoints group", + allow_module_level=True) + MODEL_NAME = "openai-community/gpt2" CHAT_TEMPLATE = "Dummy chat template for testing {}" diff --git a/tests/entrypoints/test_guided_processors.py b/tests/entrypoints/test_guided_processors.py index fb32a9d155bc0..fe5c709b1db21 100644 --- a/tests/entrypoints/test_guided_processors.py +++ b/tests/entrypoints/test_guided_processors.py @@ -4,12 +4,17 @@ import torch from transformers import AutoTokenizer +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.entrypoints.openai.protocol import CompletionRequest from vllm.model_executor.guided_decoding import ( get_guided_decoding_logits_processor) from vllm.model_executor.guided_decoding.outlines_logits_processors import ( JSONLogitsProcessor, RegexLogitsProcessor) +if should_skip_test_group(group_name="TEST_ENTRYPOINTS"): + pytest.skip("TEST_ENTRYPOINTS=DISABLE, skipping entrypoints group", + allow_module_level=True) + TEST_SCHEMA = { "type": "object", "properties": { diff --git a/tests/entrypoints/test_llm_encode.py b/tests/entrypoints/test_llm_encode.py index 7c3fbe43a8384..12a0a1a269ede 100644 --- a/tests/entrypoints/test_llm_encode.py +++ b/tests/entrypoints/test_llm_encode.py @@ -3,8 +3,13 @@ import pytest +from tests.nm_utils.utils_skip import should_skip_test_group from vllm import LLM, EmbeddingRequestOutput, PoolingParams +if should_skip_test_group(group_name="TEST_ENTRYPOINTS"): + pytest.skip("TEST_ENTRYPOINTS=DISABLE, skipping entrypoints group", + allow_module_level=True) + from ..conftest import cleanup MODEL_NAME = "intfloat/e5-mistral-7b-instruct" diff --git a/tests/entrypoints/test_llm_generate.py b/tests/entrypoints/test_llm_generate.py index a00fff91a310e..96b47fb5e170b 100644 --- a/tests/entrypoints/test_llm_generate.py +++ b/tests/entrypoints/test_llm_generate.py @@ -3,10 +3,15 @@ import pytest +from tests.nm_utils.utils_skip import should_skip_test_group from vllm import LLM, RequestOutput, SamplingParams from ..conftest import cleanup +if 
should_skip_test_group(group_name="TEST_ENTRYPOINTS"): + pytest.skip("TEST_ENTRYPOINTS=DISABLE, skipping entrypoints group", + allow_module_level=True) + MODEL_NAME = "facebook/opt-125m" PROMPTS = [ diff --git a/tests/entrypoints/test_openai_run_batch.py b/tests/entrypoints/test_openai_run_batch.py index 5de28513ca391..6ce7bc08b6cb2 100644 --- a/tests/entrypoints/test_openai_run_batch.py +++ b/tests/entrypoints/test_openai_run_batch.py @@ -2,8 +2,15 @@ import sys import tempfile +import pytest + +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.entrypoints.openai.protocol import BatchRequestOutput +if should_skip_test_group(group_name="TEST_ENTRYPOINTS"): + pytest.skip("TEST_ENTRYPOINTS=DISABLE, skipping entrypoints group", + allow_module_level=True) + # ruff: noqa: E501 INPUT_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}} {"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}""" diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py index ac895e04dc1f9..d23705e46be70 100644 --- a/tests/entrypoints/test_openai_server.py +++ b/tests/entrypoints/test_openai_server.py @@ -13,9 +13,14 @@ from huggingface_hub import snapshot_download from openai import BadRequestError +from tests.nm_utils.utils_skip import should_skip_test_group from tests.utils import ServerRunner from vllm.transformers_utils.tokenizer import get_tokenizer +if should_skip_test_group(group_name="TEST_ENTRYPOINTS"): + pytest.skip("TEST_ENTRYPOINTS=DISABLE, skipping entrypoints group", + allow_module_level=True) + # any model with a chat template should work here MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" EMBEDDING_MODEL_NAME = "intfloat/e5-mistral-7b-instruct" diff --git a/tests/entrypoints/test_server_oot_registration.py b/tests/entrypoints/test_server_oot_registration.py index 3e55d7f4297fb..394594fcbf085 100644 --- a/tests/entrypoints/test_server_oot_registration.py +++ b/tests/entrypoints/test_server_oot_registration.py @@ -5,11 +5,16 @@ import torch from openai import OpenAI, OpenAIError +from tests.nm_utils.utils_skip import should_skip_test_group from vllm import ModelRegistry from vllm.model_executor.models.opt import OPTForCausalLM from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.utils import get_open_port +if should_skip_test_group(group_name="TEST_ENTRYPOINTS"): + pytest.skip("TEST_ENTRYPOINTS=DISABLE, skipping entrypoints group", + allow_module_level=True) + pytestmark = pytest.mark.openai diff --git a/tests/kernels/test_activation.py b/tests/kernels/test_activation.py index a4b9f91c7688b..320f6e76bb0b8 100644 --- a/tests/kernels/test_activation.py +++ b/tests/kernels/test_activation.py @@ -3,11 +3,16 @@ import pytest import torch +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.model_executor.layers.activation import (FastGELU, GeluAndMul, NewGELU, SiluAndMul) from .allclose_default import get_default_atol, get_default_rtol +if should_skip_test_group(group_name="TEST_KERNELS"): + pytest.skip("TEST_KERNELS=DISABLE, skipping kernels test group", + 
allow_module_level=True) + DTYPES = [torch.half, torch.bfloat16, torch.float] NUM_TOKENS = [7, 83, 2048] # Arbitrary values for testing D = [512, 4096, 5120, 13824] # Arbitrary values for testing diff --git a/tests/kernels/test_attention.py b/tests/kernels/test_attention.py index fa5c951a7fa7a..458226ce38ccd 100644 --- a/tests/kernels/test_attention.py +++ b/tests/kernels/test_attention.py @@ -6,11 +6,16 @@ from xformers import ops as xops from xformers.ops.fmha.attn_bias import BlockDiagonalCausalMask +from tests.nm_utils.utils_skip import should_skip_test_group from vllm import _custom_ops as ops from vllm.utils import get_max_shared_memory_bytes, is_hip from .allclose_default import get_default_atol, get_default_rtol +if should_skip_test_group(group_name="TEST_KERNELS"): + pytest.skip("TEST_KERNELS=DISABLE, skipping kernels test group", + allow_module_level=True) + FLOAT32_BYTES = torch.finfo(torch.float).bits // 8 # This will change depending on the compute capability. # - 512 as a buffer diff --git a/tests/kernels/test_attention_selector.py b/tests/kernels/test_attention_selector.py index 79e03c7478de0..79358a2c47c68 100644 --- a/tests/kernels/test_attention_selector.py +++ b/tests/kernels/test_attention_selector.py @@ -5,8 +5,13 @@ from tests.kernels.utils import (STR_FLASH_ATTN_VAL, STR_INVALID_VAL, override_backend_env_variable) +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.attention.selector import which_attn_to_use +if should_skip_test_group(group_name="TEST_KERNELS"): + pytest.skip("TEST_KERNELS=DISABLE, skipping kernels test group", + allow_module_level=True) + @pytest.mark.parametrize( "name", ["TORCH_SDPA", "ROCM_FLASH", "XFORMERS", "FLASHINFER"]) diff --git a/tests/kernels/test_blocksparse_attention.py b/tests/kernels/test_blocksparse_attention.py index 12109f8767782..8a4b7e62a053b 100644 --- a/tests/kernels/test_blocksparse_attention.py +++ b/tests/kernels/test_blocksparse_attention.py @@ -4,6 +4,7 @@ import pytest import torch +from tests.nm_utils.utils_skip import should_skip_test_group from vllm import _custom_ops as ops from vllm.attention.ops.blocksparse_attention.interface import ( LocalStridedBlockSparseAttn) @@ -11,6 +12,10 @@ from .allclose_default import get_default_atol, get_default_rtol +if should_skip_test_group(group_name="TEST_KERNELS"): + pytest.skip("TEST_KERNELS=DISABLE, skipping kernels test group", + allow_module_level=True) + FLOAT32_BYTES = torch.finfo(torch.float).bits // 8 # This will change depending on the compute capability. 
# - 512 as a buffer diff --git a/tests/kernels/test_cache.py b/tests/kernels/test_cache.py index f26eb896105f6..f7aec1cb5b677 100644 --- a/tests/kernels/test_cache.py +++ b/tests/kernels/test_cache.py @@ -4,8 +4,13 @@ import pytest import torch +from tests.nm_utils.utils_skip import should_skip_test_group from vllm import _custom_ops as ops +if should_skip_test_group(group_name="TEST_KERNELS"): + pytest.skip("TEST_KERNELS=DISABLE, skipping kernels test group", + allow_module_level=True) + COPYING_DIRECTION = [('cuda', 'cpu'), ('cuda', 'cuda'), ('cpu', 'cuda')] DTYPES = [torch.half, torch.bfloat16, torch.float] NUM_TOKENS = [42] # Arbitrary values for testing diff --git a/tests/kernels/test_cutlass.py b/tests/kernels/test_cutlass.py index 276ecf00246c7..a9aeeb3a78bf5 100644 --- a/tests/kernels/test_cutlass.py +++ b/tests/kernels/test_cutlass.py @@ -7,8 +7,13 @@ import pytest import torch +from tests.nm_utils.utils_skip import should_skip_test_group from vllm import _custom_ops as ops +if should_skip_test_group(group_name="TEST_KERNELS"): + pytest.skip("TEST_KERNELS=DISABLE, skipping kernels test group", + allow_module_level=True) + CUDA_DEVICES = [ f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) ] diff --git a/tests/kernels/test_flash_attn.py b/tests/kernels/test_flash_attn.py index 22772d4ea4422..4437a5ddc8d7a 100644 --- a/tests/kernels/test_flash_attn.py +++ b/tests/kernels/test_flash_attn.py @@ -4,6 +4,12 @@ import torch from vllm_flash_attn import flash_attn_varlen_func, flash_attn_with_kvcache +from tests.nm_utils.utils_skip import should_skip_test_group + +if should_skip_test_group(group_name="TEST_KERNELS"): + pytest.skip("TEST_KERNELS=DISABLE, skipping kernels test group", + allow_module_level=True) + NUM_HEADS = [(16, 16), (32, 8), (64, 8)] HEAD_SIZES = [128, 256] BLOCK_SIZES = [16, 32] diff --git a/tests/kernels/test_int8_quant.py b/tests/kernels/test_int8_quant.py index 0daf7439468aa..679b21fc74606 100644 --- a/tests/kernels/test_int8_quant.py +++ b/tests/kernels/test_int8_quant.py @@ -3,6 +3,11 @@ # ruff: noqa: F401 import vllm._C +from tests.nm_utils.utils_skip import should_skip_test_group + +if should_skip_test_group(group_name="TEST_KERNELS"): + pytest.skip("TEST_KERNELS=DISABLE, skipping kernels test group", + allow_module_level=True) DTYPES = [torch.half, torch.bfloat16, torch.float] HIDDEN_SIZES = [16, 67, 768, 2048, 5120, 5137, 8192, diff --git a/tests/kernels/test_layernorm.py b/tests/kernels/test_layernorm.py index a635e6c12c594..13e4b4febe54b 100644 --- a/tests/kernels/test_layernorm.py +++ b/tests/kernels/test_layernorm.py @@ -1,8 +1,13 @@ import pytest import torch +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.model_executor.layers.layernorm import RMSNorm +if should_skip_test_group(group_name="TEST_KERNELS"): + pytest.skip("TEST_KERNELS=DISABLE, skipping kernels test group", + allow_module_level=True) + DTYPES = [torch.half, torch.bfloat16, torch.float] NUM_TOKENS = [7, 83, 4096] # Arbitrary values for testing HIDDEN_SIZES = [768, 769, 770, 771, 5120, 5124, 5125, 5126, 8192, diff --git a/tests/kernels/test_marlin_gemm.py b/tests/kernels/test_marlin_gemm.py index 1f8d94bad26d9..1e59513a1ae48 100644 --- a/tests/kernels/test_marlin_gemm.py +++ b/tests/kernels/test_marlin_gemm.py @@ -5,6 +5,7 @@ import pytest import torch +from tests.nm_utils.utils_skip import should_skip_test_group from vllm import _custom_ops as ops from vllm.model_executor.layers.quantization.gptq_marlin import ( GPTQ_MARLIN_MAX_PARALLEL, 
GPTQ_MARLIN_MIN_THREAD_N, @@ -20,6 +21,10 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import ( gptq_pack, quantize_weights, sort_weights) +if should_skip_test_group(group_name="TEST_KERNELS"): + pytest.skip("TEST_KERNELS=DISABLE, skipping kernels test group", + allow_module_level=True) + ACT_ORDER_OPTS = [False, True] K_FULL_OPTS = [False, True] diff --git a/tests/kernels/test_moe.py b/tests/kernels/test_moe.py index d08410a8bdf65..7fad6e53ee754 100644 --- a/tests/kernels/test_moe.py +++ b/tests/kernels/test_moe.py @@ -7,10 +7,15 @@ from transformers import MixtralConfig from transformers.models.mixtral.modeling_mixtral import MixtralSparseMoeBlock +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.fused_moe import fused_moe from vllm.model_executor.models.mixtral import MixtralMoE +if should_skip_test_group(group_name="TEST_KERNELS"): + pytest.skip("TEST_KERNELS=DISABLE, skipping kernels test group", + allow_module_level=True) + def torch_moe(a, w1, w2, score, topk): B, D = a.shape diff --git a/tests/kernels/test_pos_encoding.py b/tests/kernels/test_pos_encoding.py index e564e325112a6..2934959f3d37e 100644 --- a/tests/kernels/test_pos_encoding.py +++ b/tests/kernels/test_pos_encoding.py @@ -4,10 +4,15 @@ import pytest import torch +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.model_executor.layers.rotary_embedding import get_rope from .allclose_default import get_default_atol, get_default_rtol +if should_skip_test_group(group_name="TEST_KERNELS"): + pytest.skip("TEST_KERNELS=DISABLE, skipping kernels test group", + allow_module_level=True) + IS_NEOX_STYLE = [True, False] DTYPES = [torch.half, torch.bfloat16, torch.float] HEAD_SIZES = [64, 80, 96, 112, 128, 192, 256] diff --git a/tests/kernels/test_prefix_prefill.py b/tests/kernels/test_prefix_prefill.py index 3534468355d9d..630cf77cd4b51 100644 --- a/tests/kernels/test_prefix_prefill.py +++ b/tests/kernels/test_prefix_prefill.py @@ -7,9 +7,14 @@ from xformers import ops as xops from xformers.ops.fmha.attn_bias import BlockDiagonalCausalFromBottomRightMask +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.attention.backends.xformers import _make_alibi_bias from vllm.attention.ops.prefix_prefill import context_attention_fwd +if should_skip_test_group(group_name="TEST_KERNELS"): + pytest.skip("TEST_KERNELS=DISABLE, skipping kernels test group", + allow_module_level=True) + NUM_HEADS = [64] NUM_QUERIES_PER_KV = [1, 8, 64] HEAD_SIZES = [128, 96, 24] diff --git a/tests/kernels/test_rand.py b/tests/kernels/test_rand.py index 1e38253937ed5..737467e5f6252 100644 --- a/tests/kernels/test_rand.py +++ b/tests/kernels/test_rand.py @@ -3,9 +3,14 @@ import pytest import torch +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.model_executor.layers.ops.rand import seeded_uniform from vllm.model_executor.utils import set_random_seed +if should_skip_test_group(group_name="TEST_KERNELS"): + pytest.skip("TEST_KERNELS=DISABLE, skipping kernels test group", + allow_module_level=True) + @pytest.mark.skip("C compiler not installed in NM automation. 
" "This codepath follows a triton pathway, which " diff --git a/tests/kernels/test_sampler.py b/tests/kernels/test_sampler.py index feb915932bba5..951590a2278e6 100644 --- a/tests/kernels/test_sampler.py +++ b/tests/kernels/test_sampler.py @@ -5,12 +5,17 @@ import triton import triton.language as tl +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.model_executor.layers.ops.sample import ( MAX_TRITON_N_COLS, _uniform_to_exponential, get_num_triton_sampler_splits, sample) from vllm.model_executor.sampling_metadata import SamplingTensors from vllm.model_executor.utils import set_random_seed +if should_skip_test_group(group_name="TEST_KERNELS"): + pytest.skip("TEST_KERNELS=DISABLE, skipping kernels test group", + allow_module_level=True) + SINGLE_SPLIT_VOCAB_SIZE = 32000 # llama/mistral/mixtral vocab size MULTI_SPLIT_VOCAB_SIZE = MAX_TRITON_N_COLS + 100 diff --git a/tests/lora/test_baichuan.py b/tests/lora/test_baichuan.py index 5ab863eea94b3..825f26ad28892 100644 --- a/tests/lora/test_baichuan.py +++ b/tests/lora/test_baichuan.py @@ -1,10 +1,15 @@ import pytest import vllm +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.lora.request import LoRARequest from .conftest import cleanup +if should_skip_test_group(group_name="TEST_LORA"): + pytest.skip("TEST_LORA=DISABLE, skipping lora test group", + allow_module_level=True) + MODEL_PATH = "baichuan-inc/Baichuan-7B" PROMPT_TEMPLATE = """I want you to act as a SQL terminal in front of an example database, you need only to return the sql command to me.Below is an instruction that describes a task, Write a response that appropriately completes the request.\n"\n##Instruction:\nconcert_singer contains tables such as stadium, singer, concert, singer_in_concert. Table stadium has columns such as Stadium_ID, Location, Name, Capacity, Highest, Lowest, Average. Stadium_ID is the primary key.\nTable singer has columns such as Singer_ID, Name, Country, Song_Name, Song_release_year, Age, Is_male. Singer_ID is the primary key.\nTable concert has columns such as concert_ID, concert_Name, Theme, Stadium_ID, Year. concert_ID is the primary key.\nTable singer_in_concert has columns such as concert_ID, Singer_ID. 
concert_ID is the primary key.\nThe Stadium_ID of concert is the foreign key of Stadium_ID of stadium.\nThe Singer_ID of singer_in_concert is the foreign key of Singer_ID of singer.\nThe concert_ID of singer_in_concert is the foreign key of concert_ID of concert.\n\n###Input:\n{query}\n\n###Response:""" # noqa: E501 @@ -105,4 +110,4 @@ def test_baichuan_tensor_parallel_equality(baichuan_lora_files): del llm_tp4 cleanup() - assert output_tp1 == output_tp4 \ No newline at end of file + assert output_tp1 == output_tp4 diff --git a/tests/lora/test_chatglm3.py b/tests/lora/test_chatglm3.py index bd8cc98ef8ca0..9cee24c90f972 100644 --- a/tests/lora/test_chatglm3.py +++ b/tests/lora/test_chatglm3.py @@ -1,6 +1,13 @@ +import pytest + import vllm +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.lora.request import LoRARequest +if should_skip_test_group(group_name="TEST_LORA"): + pytest.skip("TEST_LORA=DISABLE, skipping lora test group", + allow_module_level=True) + MODEL_PATH = "THUDM/chatglm3-6b" PROMPT_TEMPLATE = """I want you to act as a SQL terminal in front of an example database, you need only to return the sql command to me.Below is an instruction that describes a task, Write a response that appropriately completes the request.\n"\n##Instruction:\nconcert_singer contains tables such as stadium, singer, concert, singer_in_concert. Table stadium has columns such as Stadium_ID, Location, Name, Capacity, Highest, Lowest, Average. Stadium_ID is the primary key.\nTable singer has columns such as Singer_ID, Name, Country, Song_Name, Song_release_year, Age, Is_male. Singer_ID is the primary key.\nTable concert has columns such as concert_ID, concert_Name, Theme, Stadium_ID, Year. concert_ID is the primary key.\nTable singer_in_concert has columns such as concert_ID, Singer_ID. concert_ID is the primary key.\nThe Stadium_ID of concert is the foreign key of Stadium_ID of stadium.\nThe Singer_ID of singer_in_concert is the foreign key of Singer_ID of singer.\nThe concert_ID of singer_in_concert is the foreign key of concert_ID of concert.\n\n###Input:\n{query}\n\n###Response:""" # noqa: E501 diff --git a/tests/lora/test_gemma.py b/tests/lora/test_gemma.py index bf96de026ae09..0c31726dc0fd0 100644 --- a/tests/lora/test_gemma.py +++ b/tests/lora/test_gemma.py @@ -1,8 +1,13 @@ import pytest import vllm +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.lora.request import LoRARequest +if should_skip_test_group(group_name="TEST_LORA"): + pytest.skip("TEST_LORA=DISABLE, skipping lora test group", + allow_module_level=True) + MODEL_PATH = "google/gemma-7b" diff --git a/tests/lora/test_layer_variation.py b/tests/lora/test_layer_variation.py index ace10e389ae6a..712f822d9bed9 100644 --- a/tests/lora/test_layer_variation.py +++ b/tests/lora/test_layer_variation.py @@ -7,10 +7,15 @@ from transformers import AutoModelForCausalLM import vllm +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.lora.request import LoRARequest from .conftest import cleanup +if should_skip_test_group(group_name="TEST_LORA"): + pytest.skip("TEST_LORA=DISABLE, skipping lora test group", + allow_module_level=True) + MODEL_PATH = "Felladrin/Llama-68M-Chat-v1" PROMPTS = [ "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. 
This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nSpellForce 3 is a pretty bad game. The developer Grimlore Games is clearly a bunch of no-talent hacks, and 2017 was a terrible year for games anyway. [/user] [assistant]", # noqa: E501 diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py index 63fd2cd9e7fbb..a3d9203093fef 100644 --- a/tests/lora/test_layers.py +++ b/tests/lora/test_layers.py @@ -8,6 +8,7 @@ import torch import torch.nn.functional as F +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.config import LoRAConfig from vllm.lora.fully_sharded_layers import ( ColumnParallelLinearWithShardedLoRA, @@ -38,6 +39,10 @@ from .utils import DummyLoRAManager +if should_skip_test_group(group_name="TEST_LORA"): + pytest.skip("TEST_LORA=DISABLE, skipping lora test group", + allow_module_level=True) + TOLERANCES = { torch.float16: (5e-3, 5e-3), torch.float32: (5e-3, 5e-3), diff --git a/tests/lora/test_llama.py b/tests/lora/test_llama.py index 7143a99bea081..ff1d82ba7104f 100644 --- a/tests/lora/test_llama.py +++ b/tests/lora/test_llama.py @@ -2,10 +2,15 @@ import ray import vllm +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.lora.request import LoRARequest from .conftest import cleanup +if should_skip_test_group(group_name="TEST_LORA"): + pytest.skip("TEST_LORA=DISABLE, skipping lora test group", + allow_module_level=True) + MODEL_PATH = "meta-llama/Llama-2-7b-hf" diff --git a/tests/lora/test_long_context.py b/tests/lora/test_long_context.py index feb58aa28bda4..793e34bf27e19 100644 --- a/tests/lora/test_long_context.py +++ b/tests/lora/test_long_context.py @@ -5,6 +5,7 @@ import pytest import vllm +from tests.nm_utils.utils_skip import should_skip_test_group from vllm import SamplingParams from vllm.lora.layers import LinearScalingRotaryEmbeddingWithLora from vllm.lora.request import LoRARequest @@ -13,6 +14,10 @@ from .data.long_context_test_data import prompts_and_responses +if should_skip_test_group(group_name="TEST_LORA"): + pytest.skip("TEST_LORA=DISABLE, skipping lora test group", + allow_module_level=True) + context_len_to_scaling_factor = { "16k": 4, "32k": 8, diff --git a/tests/lora/test_lora.py b/tests/lora/test_lora.py index 3415d36b7e341..34c6941140754 100644 --- a/tests/lora/test_lora.py +++ b/tests/lora/test_lora.py @@ -1,10 +1,15 @@ import pytest import torch +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.lora.layers import _apply_lora, _apply_lora_packed_nslice from .utils import DummyLoRAManager +if should_skip_test_group(group_name="TEST_LORA"): + pytest.skip("TEST_LORA=DISABLE, skipping lora test group", + allow_module_level=True) + TENSOR_SIZES = [128, 1024, 2048, 4096, 8192, 11008, 11008 // 2, 11008 // 4] QKV_TENSOR_SIZES = [ (8192, 1024, 1024), diff --git a/tests/lora/test_lora_checkpoints.py b/tests/lora/test_lora_checkpoints.py index d4d1665b624ea..9c9a0fea5cb6c 100644 --- a/tests/lora/test_lora_checkpoints.py +++ b/tests/lora/test_lora_checkpoints.py @@ -1,8 +1,13 @@ import pytest 
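Every test module touched by this patch gains the same module-level guard; for readability, here is the pattern once in full as a self-contained sketch (shown for the TEST_LORA group, matching the surrounding hunks):

import pytest

from tests.nm_utils.utils_skip import should_skip_test_group

# Raised at collection time, so the module is reported as skipped
# instead of failing on import when its group is disabled.
if should_skip_test_group(group_name="TEST_LORA"):
    pytest.skip("TEST_LORA=DISABLE, skipping lora test group",
                allow_module_level=True)

Because allow_module_level=True stops collection of the module, pytest records the module as skipped rather than erroring at import.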
+from tests.nm_utils.utils_skip import should_skip_test_group from vllm.lora.models import LoRAModel from vllm.model_executor.models.baichuan import BaiChuanBaseForCausalLM +if should_skip_test_group(group_name="TEST_LORA"): + pytest.skip("TEST_LORA=DISABLE, skipping lora test group", + allow_module_level=True) + lora_lst = ["baichuan7B", "baichuan7B-zero", "chatglm3-6b"] diff --git a/tests/lora/test_lora_manager.py b/tests/lora/test_lora_manager.py index c08eee9910149..09a27c90f4768 100644 --- a/tests/lora/test_lora_manager.py +++ b/tests/lora/test_lora_manager.py @@ -6,6 +6,7 @@ from safetensors.torch import load_file from torch import nn +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.config import LoRAConfig from vllm.lora.layers import (ColumnParallelLinearWithLoRA, MergedColumnParallelLinearWithLoRA, @@ -18,6 +19,10 @@ WorkerLoRAManager) from vllm.model_executor.layers.linear import RowParallelLinear +if should_skip_test_group(group_name="TEST_LORA"): + pytest.skip("TEST_LORA=DISABLE, skipping lora test group", + allow_module_level=True) + EMBEDDING_MODULES = { "embed_tokens": "input_embeddings", "lm_head": "output_embeddings", diff --git a/tests/lora/test_mixtral.py b/tests/lora/test_mixtral.py index 53d49a8dbc813..f7541f271fd98 100644 --- a/tests/lora/test_mixtral.py +++ b/tests/lora/test_mixtral.py @@ -2,8 +2,13 @@ import torch import vllm +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.lora.request import LoRARequest +if should_skip_test_group(group_name="TEST_LORA"): + pytest.skip("TEST_LORA=DISABLE, skipping lora test group", + allow_module_level=True) + MODEL_PATH = "mistralai/Mixtral-8x7B-Instruct-v0.1" diff --git a/tests/lora/test_phi.py b/tests/lora/test_phi.py index a2b42ce4cb96f..13636b9be5140 100644 --- a/tests/lora/test_phi.py +++ b/tests/lora/test_phi.py @@ -1,6 +1,13 @@ +import pytest + import vllm +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.lora.request import LoRARequest +if should_skip_test_group(group_name="TEST_LORA"): + pytest.skip("TEST_LORA=DISABLE, skipping lora test group", + allow_module_level=True) + MODEL_PATH = "microsoft/phi-2" PROMPT_TEMPLATE = "### Instruct: {sql_prompt}\n\n### Context: {context}\n\n### Output:" # noqa: E501 diff --git a/tests/lora/test_punica.py b/tests/lora/test_punica.py index f021c003b1322..29b4f9c411e1d 100644 --- a/tests/lora/test_punica.py +++ b/tests/lora/test_punica.py @@ -4,6 +4,11 @@ import torch import vllm.lora.punica as punica +from tests.nm_utils.utils_skip import should_skip_test_group + +if should_skip_test_group(group_name="TEST_LORA"): + pytest.skip("TEST_LORA=DISABLE, skipping lora test group", + allow_module_level=True) def assert_close(a, b): diff --git a/tests/lora/test_quant_model.py b/tests/lora/test_quant_model.py index feda06b256e04..278acd2dcdb89 100644 --- a/tests/lora/test_quant_model.py +++ b/tests/lora/test_quant_model.py @@ -6,10 +6,15 @@ import pytest import vllm +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.lora.request import LoRARequest from .conftest import cleanup +if should_skip_test_group(group_name="TEST_LORA"): + pytest.skip("TEST_LORA=DISABLE, skipping lora test group", + allow_module_level=True) + @dataclass class ModelWithQuantization: diff --git a/tests/lora/test_tokenizer_group.py b/tests/lora/test_tokenizer_group.py index 2dcad23c2b547..ce72a63016732 100644 --- a/tests/lora/test_tokenizer_group.py +++ b/tests/lora/test_tokenizer_group.py @@ -1,12 +1,17 @@ import pytest from transformers 
import AutoTokenizer, PreTrainedTokenizerBase +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.lora.request import LoRARequest from vllm.transformers_utils.tokenizer import get_lora_tokenizer from vllm.transformers_utils.tokenizer_group import get_tokenizer_group from ..conftest import get_tokenizer_pool_config +if should_skip_test_group(group_name="TEST_LORA"): + pytest.skip("TEST_LORA=DISABLE, skipping lora test group", + allow_module_level=True) + @pytest.mark.asyncio @pytest.mark.parametrize("tokenizer_group_type", [None, "ray"]) diff --git a/tests/lora/test_utils.py b/tests/lora/test_utils.py index 4ff9715b4ca8d..39fefc33c4c03 100644 --- a/tests/lora/test_utils.py +++ b/tests/lora/test_utils.py @@ -3,9 +3,14 @@ import pytest from torch import nn +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.lora.utils import parse_fine_tuned_lora_name, replace_submodule from vllm.utils import LRUCache +if should_skip_test_group(group_name="TEST_LORA"): + pytest.skip("TEST_LORA=DISABLE, skipping lora test group", + allow_module_level=True) + def test_parse_fine_tuned_lora_name_valid(): fixture = { diff --git a/tests/lora/test_worker.py b/tests/lora/test_worker.py index 732e91a52c0a9..8c45e15b50d0c 100644 --- a/tests/lora/test_worker.py +++ b/tests/lora/test_worker.py @@ -3,12 +3,19 @@ import tempfile from unittest.mock import patch +import pytest + +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, ModelConfig, ParallelConfig, SchedulerConfig) from vllm.lora.models import LoRAMapping from vllm.lora.request import LoRARequest from vllm.worker.worker import Worker +if should_skip_test_group(group_name="TEST_LORA"): + pytest.skip("TEST_LORA=DISABLE, skipping lora test group", + allow_module_level=True) + @patch.dict(os.environ, {"RANK": "0"}) def test_worker_apply_lora(sql_lora_files): diff --git a/tests/metrics/test_metrics.py b/tests/metrics/test_metrics.py index c1164739eee31..e953f7226b5af 100644 --- a/tests/metrics/test_metrics.py +++ b/tests/metrics/test_metrics.py @@ -3,11 +3,16 @@ import pytest from prometheus_client import REGISTRY +from tests.nm_utils.utils_skip import should_skip_test_group from vllm import EngineArgs, LLMEngine from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.sampling_params import SamplingParams +if should_skip_test_group(group_name="TEST_METRICS"): + pytest.skip("TEST_METRICS=DISABLE, skipping metrics test group", + allow_module_level=True) + MODELS = [ "facebook/opt-125m", ] diff --git a/tests/model_executor/weight_utils.py b/tests/model_executor/weight_utils.py index c8b9bed691bba..4bd0afdc8ca68 100644 --- a/tests/model_executor/weight_utils.py +++ b/tests/model_executor/weight_utils.py @@ -5,9 +5,15 @@ import pytest from huggingface_hub.utils import LocalEntryNotFoundError +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.model_executor.model_loader.weight_utils import ( download_weights_from_hf, enable_hf_transfer) +if should_skip_test_group(group_name="TEST_MODEL_EXECUTOR"): + pytest.skip( + "TEST_MODEL_EXECUTOR=DISABLE, skipping model executor test group", + allow_module_level=True) + def test_hf_transfer_auto_activation(): if "HF_HUB_ENABLE_HF_TRANSFER" in os.environ: diff --git a/tests/models/compare_utils.py b/tests/models/compare_utils.py index 051cbf1547b21..337428c6c6535 100644 --- a/tests/models/compare_utils.py +++ 
b/tests/models/compare_utils.py @@ -34,4 +34,5 @@ def check_logprobs_close(outputs_0_lst, outputs_1_lst, name_0, name_1): # Break out since sequences will now diverge. # as long as we got this far with the output tokens being the # same, or close, the responses are close enough + print(f"INFO: BREAK IN CHECK LOGPROBS CLOSE AT IDX: {idx}\n\n") break diff --git a/tests/models/test_aqlm.py b/tests/models/test_aqlm.py index c4ecf846e633c..e24f2632a28d0 100644 --- a/tests/models/test_aqlm.py +++ b/tests/models/test_aqlm.py @@ -6,8 +6,13 @@ import pytest import torch +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS +if should_skip_test_group(group_name="TEST_MODELS"): + pytest.skip("TEST_MODELS=DISABLE, skipping model test group", + allow_module_level=True) + aqlm_not_supported = True if torch.cuda.is_available(): diff --git a/tests/models/test_big_models.py b/tests/models/test_big_models.py index 48b655e58d602..962af078618fa 100644 --- a/tests/models/test_big_models.py +++ b/tests/models/test_big_models.py @@ -10,6 +10,12 @@ import pytest import torch +from tests.nm_utils.utils_skip import should_skip_test_group + +if should_skip_test_group(group_name="TEST_MODELS"): + pytest.skip("TEST_MODELS=DISABLE, skipping model test group", + allow_module_level=True) + MODELS = [ "meta-llama/Llama-2-7b-hf", "mistralai/Mistral-7B-v0.1", diff --git a/tests/models/test_compressed.py b/tests/models/test_compressed.py deleted file mode 100644 index 7bd9ae9409847..0000000000000 --- a/tests/models/test_compressed.py +++ /dev/null @@ -1,61 +0,0 @@ -"""Compare the outputs of a sparse model vs sparse model running dense. -Note: sparse kernels do not have bitwise correctness vs the dense models. -As a result, in this test, we just confirm that the top selected tokens of the -sparse models are in the top N selections of same model running dense. - -Run `pytest tests/models/test_compressed.py`. 
-""" - -import gc - -import pytest - -from tests.models.utils import check_logprobs_close - -MAX_MODEL_LEN = 1024 -MODEL_FORMAT_PAIRS = [ - ("nm-testing/TinyLlama-1.1B-Chat-v1.0-pruned2.4", - "semi_structured_sparse_w16a16"), - ("nm-testing/OpenHermes-2.5-Mistral-7B-pruned50", "sparse_w16a16"), -] - - -@pytest.mark.parametrize("model_format_pairs", MODEL_FORMAT_PAIRS) -@pytest.mark.parametrize("dtype", ["half"]) -@pytest.mark.parametrize("max_tokens", [32]) -@pytest.mark.parametrize("num_logprobs", [5]) -def test_models( - vllm_runner, - example_prompts, - model_format_pairs, - dtype: str, - max_tokens: int, - num_logprobs: int, -) -> None: - model_name, sparsity = model_format_pairs - - sparse_model = vllm_runner(model_name=model_name, - sparsity=sparsity, - dtype=dtype, - max_model_len=MAX_MODEL_LEN) - sparse_outputs = sparse_model.generate_greedy_logprobs( - example_prompts, max_tokens, num_logprobs) - del sparse_model - - gc.collect() - - dense_model = vllm_runner(model_name=model_name, - sparsity=None, - dtype=dtype, - max_model_len=MAX_MODEL_LEN) - dense_outputs = dense_model.generate_greedy_logprobs( - example_prompts, max_tokens, num_logprobs) - del dense_model - - # loop through the prompts - check_logprobs_close( - outputs_0_lst=dense_outputs, - outputs_1_lst=sparse_outputs, - name_0="dense", - name_1="sparse", - ) diff --git a/tests/models/test_embedding.py b/tests/models/test_embedding.py index 6556998b68a74..a7e4fc102cdb9 100644 --- a/tests/models/test_embedding.py +++ b/tests/models/test_embedding.py @@ -6,6 +6,12 @@ import torch import torch.nn.functional as F +from tests.nm_utils.utils_skip import should_skip_test_group + +if should_skip_test_group(group_name="TEST_MODELS"): + pytest.skip("TEST_MODELS=DISABLE, skipping model test group", + allow_module_level=True) + MODELS = [ "intfloat/e5-mistral-7b-instruct", ] diff --git a/tests/models/test_fp8.py b/tests/models/test_fp8.py index 61aee0d0a6e93..c9daed58db6ae 100644 --- a/tests/models/test_fp8.py +++ b/tests/models/test_fp8.py @@ -8,9 +8,14 @@ import torch from transformers import AutoTokenizer +from tests.nm_utils.utils_skip import should_skip_test_group from vllm import LLM, SamplingParams from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS +if should_skip_test_group(group_name="TEST_MODELS"): + pytest.skip("TEST_MODELS=DISABLE, skipping model test group", + allow_module_level=True) + os.environ["TOKENIZERS_PARALLELISM"] = "true" MAX_MODEL_LEN = 1024 diff --git a/tests/models/test_gptq_marlin.py b/tests/models/test_gptq_marlin.py index 1ecd27c5ce51e..978704cdc909c 100644 --- a/tests/models/test_gptq_marlin.py +++ b/tests/models/test_gptq_marlin.py @@ -14,9 +14,14 @@ import torch from tests.models.utils import check_logprobs_close +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS from vllm.model_executor.layers.rotary_embedding import _ROPE_DICT +if should_skip_test_group(group_name="TEST_MODELS"): + pytest.skip("TEST_MODELS=DISABLE, skipping model test group", + allow_module_level=True) + os.environ["TOKENIZERS_PARALLELISM"] = "true" MAX_MODEL_LEN = 1024 diff --git a/tests/models/test_gptq_marlin_24.py b/tests/models/test_gptq_marlin_24.py index 195c3e5b5863e..b4159ff0a4968 100644 --- a/tests/models/test_gptq_marlin_24.py +++ b/tests/models/test_gptq_marlin_24.py @@ -12,8 +12,13 @@ import torch from tests.models.utils import check_logprobs_close +from tests.nm_utils.utils_skip import should_skip_test_group from 
vllm.model_executor.layers.quantization import QUANTIZATION_METHODS +if should_skip_test_group(group_name="TEST_MODELS"): + pytest.skip("TEST_MODELS=DISABLE, skipping model test group", + allow_module_level=True) + marlin_not_supported = True if torch.cuda.is_available(): diff --git a/tests/models/test_llava.py b/tests/models/test_llava.py index a1f0cff1cc0e5..9e288b8d854c0 100644 --- a/tests/models/test_llava.py +++ b/tests/models/test_llava.py @@ -3,10 +3,15 @@ import pytest from transformers import AutoTokenizer +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.config import VisionLanguageConfig from ..conftest import IMAGE_FILES +if should_skip_test_group(group_name="TEST_MODELS"): + pytest.skip("TEST_MODELS=DISABLE, skipping model test group", + allow_module_level=True) + pytestmark = pytest.mark.llava # The image token is placed before "user" on purpose so that the test can pass diff --git a/tests/models/test_marlin.py b/tests/models/test_marlin.py index a3df2890f307c..debf018ca8a80 100644 --- a/tests/models/test_marlin.py +++ b/tests/models/test_marlin.py @@ -21,8 +21,13 @@ import torch from tests.models.utils import check_logprobs_close +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS +if should_skip_test_group(group_name="TEST_MODELS"): + pytest.skip("TEST_MODELS=DISABLE, skipping model test group", + allow_module_level=True) + marlin_not_supported = True if torch.cuda.is_available(): diff --git a/tests/models/test_mistral.py b/tests/models/test_mistral.py index 88f2e97fb8973..24a0de8d464dd 100644 --- a/tests/models/test_mistral.py +++ b/tests/models/test_mistral.py @@ -4,8 +4,14 @@ """ import pytest +from tests.nm_utils.utils_skip import should_skip_test_group + from .utils import check_logprobs_close +if should_skip_test_group(group_name="TEST_MODELS"): + pytest.skip("TEST_MODELS=DISABLE, skipping model test group", + allow_module_level=True) + MODELS = [ "mistralai/Mistral-7B-Instruct-v0.1", "mistralai/Mistral-7B-Instruct-v0.3", diff --git a/tests/models/test_models.py b/tests/models/test_models.py index c838cfcb6913a..d856caec8abdd 100644 --- a/tests/models/test_models.py +++ b/tests/models/test_models.py @@ -9,6 +9,12 @@ """ import pytest +from tests.nm_utils.utils_skip import should_skip_test_group + +if should_skip_test_group(group_name="TEST_MODELS"): + pytest.skip("TEST_MODELS=DISABLE, skipping model test group", + allow_module_level=True) + MODELS = [ "facebook/opt-125m", "gpt2", diff --git a/tests/models/test_models_logprobs.py b/tests/models/test_models_logprobs.py index 4ab78b8fbfe43..621be698a1160 100644 --- a/tests/models/test_models_logprobs.py +++ b/tests/models/test_models_logprobs.py @@ -5,6 +5,11 @@ import pytest from tests.models.utils import check_logprobs_close +from tests.nm_utils.utils_skip import should_skip_test_group + +if should_skip_test_group(group_name="TEST_MODELS"): + pytest.skip("TEST_MODELS=DISABLE, skipping model test group", + allow_module_level=True) MODEL_MAX_LEN = 1024 diff --git a/tests/models/test_oot_registration.py b/tests/models/test_oot_registration.py index 50ab06631500b..fa3f058ed8035 100644 --- a/tests/models/test_oot_registration.py +++ b/tests/models/test_oot_registration.py @@ -1,9 +1,15 @@ +import pytest import torch +from tests.nm_utils.utils_skip import should_skip_test_group from vllm import LLM, ModelRegistry, SamplingParams from vllm.model_executor.models.opt import OPTForCausalLM from 
vllm.model_executor.sampling_metadata import SamplingMetadata +if should_skip_test_group(group_name="TEST_MODELS"): + pytest.skip("TEST_MODELS=DISABLE, skipping model test group", + allow_module_level=True) + class MyOPTForCausalLM(OPTForCausalLM): diff --git a/tests/models/test_registry.py b/tests/models/test_registry.py index 547ab10051f1b..b44e93b9d4fef 100644 --- a/tests/models/test_registry.py +++ b/tests/models/test_registry.py @@ -1,7 +1,12 @@ import pytest +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.model_executor.models import _MODELS, ModelRegistry +if should_skip_test_group(group_name="TEST_MODELS"): + pytest.skip("TEST_MODELS=DISABLE, skipping model test group", + allow_module_level=True) + @pytest.mark.parametrize("model_cls", _MODELS) def test_registry_imports(model_cls): diff --git a/tests/models_core/__init__.py b/tests/models_core/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/models_core/test_llm_logprobs.py b/tests/models_core/test_llm_logprobs.py new file mode 100644 index 0000000000000..be776637c87f6 --- /dev/null +++ b/tests/models_core/test_llm_logprobs.py @@ -0,0 +1,57 @@ +"""Compare the outputs of HF and vLLM when using greedy sampling. + +Because of numerical precision and the fact that we are generating +over so many samples, we look + +Run `pytest tests/models/test_models_logprobs.py`. +""" +import pytest + +from tests.models.utils import check_logprobs_close +from tests.nm_utils.utils_skip import should_skip_test_group + +if should_skip_test_group(group_name="TEST_MODELS_CORE"): + pytest.skip("TEST_MODELS_CORE=DISABLE, skipping core model test group", + allow_module_level=True) + +MODEL_MAX_LEN = 1024 + +MODELS = [ + # Llama (8B param variant) + "meta-llama/Meta-Llama-3-8B-Instruct", + # Qwen2 (7B param variant) + "Qwen/Qwen2-7B-Instruct", +] + + +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("max_tokens", [32]) +@pytest.mark.parametrize("num_logprobs", [5]) +def test_models( + vllm_runner_nm, + hf_runner_nm, + example_prompts, + model: str, + max_tokens: int, + num_logprobs: int, +) -> None: + hf_model = hf_runner_nm(model) + hf_outputs = hf_model.generate_greedy_logprobs_nm(example_prompts, + max_tokens, num_logprobs) + + del hf_model + + vllm_model = vllm_runner_nm(model, max_model_len=MODEL_MAX_LEN) + vllm_outputs = vllm_model.generate_greedy_logprobs(example_prompts, + max_tokens, + num_logprobs) + + del vllm_model + + # loop through the prompts + check_logprobs_close( + outputs_0_lst=hf_outputs, + outputs_1_lst=vllm_outputs, + name_0="hf_model", + name_1="vllm_model", + ) diff --git a/tests/models/test_compressed_memory.py b/tests/models_core/test_magic_wand.py similarity index 54% rename from tests/models/test_compressed_memory.py rename to tests/models_core/test_magic_wand.py index 5d6392f4a9e45..a24618ec09a4a 100644 --- a/tests/models/test_compressed_memory.py +++ b/tests/models_core/test_magic_wand.py @@ -1,18 +1,21 @@ -"""Checks the memory usage of the sparse model is < memory usage of the -dense model by checking that the number of KV cache blocks is -bigger for the sparse model rather than the dense model. vLLM pre-allocates -the memory for the KV-cache after checking availability once the model -is loaded. This implies that using a compressed model should give more space -for the KV cache and thus more allocated blocks. - -Run `pytest tests/models/test_sparse_memory.py --forked`. -""" +"""Compare the outputs of a sparse model vs sparse model running dense. 
+Note: sparse kernels do not have bitwise correctness vs the dense models. +As a result, in this test, we just confirm that the top selected tokens of the +sparse models are in the top N selections of same model running dense. -import gc +Run `pytest tests/models/test_compressed.py`. +""" import pytest -import torch +from tests.models.utils import check_logprobs_close +from tests.nm_utils.utils_skip import should_skip_test_group + +if should_skip_test_group(group_name="TEST_MODELS_CORE"): + pytest.skip("TEST_MODELS_CORE=DISABLE, skipping core model test group", + allow_module_level=True) + +MAX_MODEL_LEN = 1024 MODEL_FORMAT_EXTRABLOCKS = [ ("nm-testing/OpenHermes-2.5-Mistral-7B-pruned50", "sparse_w16a16", 1500), ("nm-testing/OpenHermes-2.5-Mistral-7B-pruned2.4", @@ -22,10 +25,15 @@ @pytest.mark.parametrize("model_format_extrablocks", MODEL_FORMAT_EXTRABLOCKS) @pytest.mark.parametrize("dtype", ["half"]) -def test_models( +@pytest.mark.parametrize("max_tokens", [32]) +@pytest.mark.parametrize("num_logprobs", [5]) +def test_magic_wand( vllm_runner, + example_prompts, model_format_extrablocks, dtype: str, + max_tokens: int, + num_logprobs: int, ) -> None: model_name, sparsity, num_extra_blocks = model_format_extrablocks dense_model = vllm_runner(model_name=model_name, @@ -36,10 +44,9 @@ def test_models( dense_gpu_alloc = ( dense_model.model.llm_engine.scheduler.block_manager.gpu_allocator) dense_num_kv_blocks = dense_gpu_alloc.num_blocks - + dense_outputs = dense_model.generate_greedy_logprobs( + example_prompts, max_tokens, num_logprobs) del dense_model - torch.cuda.empty_cache() - gc.collect() sparse_model = vllm_runner( model_name=model_name, @@ -51,12 +58,20 @@ def test_models( sparse_gpu_alloc = ( sparse_model.model.llm_engine.scheduler.block_manager.gpu_allocator) sparse_num_kv_blocks = sparse_gpu_alloc.num_blocks - + sparse_outputs = sparse_model.generate_greedy_logprobs( + example_prompts, max_tokens, num_logprobs) del sparse_model - torch.cuda.empty_cache() - gc.collect() + # Confirm the memory is saved. assert sparse_num_kv_blocks > dense_num_kv_blocks + num_extra_blocks, ( f"Test{model_name}: Sparse model KV cache size {sparse_num_kv_blocks} " f"not bigger than dense model KV cache size {dense_num_kv_blocks} + " f"expected num_extra_blocks {num_extra_blocks}") + + # Confirm the generations are similar. + check_logprobs_close( + outputs_0_lst=dense_outputs, + outputs_1_lst=sparse_outputs, + name_0="dense", + name_1="sparse", + ) diff --git a/tests/models_core/test_server_logprobs.py b/tests/models_core/test_server_logprobs.py new file mode 100644 index 0000000000000..1477192c0ced7 --- /dev/null +++ b/tests/models_core/test_server_logprobs.py @@ -0,0 +1,194 @@ +import asyncio +import gc +import os +import time +from typing import Dict, List, Type + +import openai +import pytest +import torch +from datasets import load_dataset +from openai import AsyncOpenAI +from transformers import AutoTokenizer + +from tests.conftest import HfRunnerNM +from tests.models.compare_utils import check_logprobs_close +from tests.nm_utils.logging import make_logger +from tests.nm_utils.server import ServerContext +from tests.nm_utils.utils_skip import should_skip_test_group + +if should_skip_test_group(group_name="TEST_MODELS_CORE"): + pytest.skip("TEST_MODELS_CORE=DISABLE, skipping core model test group", + allow_module_level=True) + +# Silence warning. +os.environ["TOKENIZERS_PARALLELISM"] = "True" + +NUM_SAMPLES_TO_RUN = 20 +NUM_CHAT_TURNS = 3 # << Should be an odd number. 
+REQUEST_RATE = 2.5 +GPU_COUNT = torch.cuda.device_count() +device_capability = torch.cuda.get_device_capability() +DEVICE_CAPABILITY = device_capability[0] * 10 + device_capability[1] + +MODELS = [ + # Llama (8B param variant) + "meta-llama/Meta-Llama-3-8B-Instruct", +] + + +@pytest.fixture(scope="session") +def client(): + client = openai.AsyncOpenAI( + base_url="http://localhost:8000/v1", + api_key="token-abc123", + ) + yield client + + +@pytest.fixture +def hf_runner_nm() -> Type[HfRunnerNM]: + return HfRunnerNM + + +async def my_chat( + client, + model: str, + messages: List[Dict], + max_tokens: int, + num_logprobs: int, +): + """ submit a single prompt chat and collect results. """ + return await client.chat.completions.create(model=model, + messages=messages, + max_tokens=max_tokens, + temperature=0, + logprobs=True, + top_logprobs=num_logprobs) + + +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("max_tokens", [32]) +@pytest.mark.parametrize("num_logprobs", [3]) +@pytest.mark.parametrize("tensor_parallel_size", [1]) +def test_models_on_server( + hf_runner_nm: HfRunnerNM, + client: AsyncOpenAI, + model: str, + max_tokens: int, + num_logprobs: int, + tensor_parallel_size: int, +) -> None: + """ + This test compares the output of the vllm OpenAI server against that of + a HuggingFace transformer. We expect them to be fairly close. "Close" + is measured by checking that the top N logprobs for each token includes + the token of the other inference tool. The first time that there is no + exact match, as long as there is a match to one of the top `num_logprobs` + logprobs, the test will not proceed further, but will pass. + + :param hf_runner_nm: fixture for the HfRunnerNM + :param client: fixture with an openai.AsyncOpenAI client + :param model: The Hugginface id for a model to test with + :param max_tokens: the maximum number of tokens to generate + :param num_logprobs: the total number of logprobs checked for "close enough" + :param tensor_parallel_size: passed to the vllm Server launch + """ + logger = make_logger("vllm_test") + + # Check that we have enough GPUs to run the test. + if tensor_parallel_size > 1 and tensor_parallel_size > GPU_COUNT: + pytest.skip(f"gpu count {GPU_COUNT} is insufficient for " + f"tensor_parallel_size = {tensor_parallel_size}") + + # Load dataset. + logger.info("Loading dataset and converting to chat format.") + ds = load_dataset("nm-testing/qa-chat-prompts", + split="train_sft").select(range(NUM_SAMPLES_TO_RUN)) + messages_list = [row["messages"][:NUM_CHAT_TURNS] for row in ds] + tokenizer = AutoTokenizer.from_pretrained(model) + + # Note: its very important to tokenize here due to silliness + # around how the tokenizer works. + # + # The following examples are not equivalent: + # + # ----- + # prompt = tokenizer.apply_chat_template(message) + # ----- + # prompt = tokenizer.apply_chat_template( + # message, tokenize=False) << adds bos + # input_ids = tokenizer(prompt).input_ids << also adds bos + # ----- + input_ids_lst = [ + tokenizer.apply_chat_template(messages, + return_tensors="pt", + add_generation_prompt=True).to("cuda") + for messages in messages_list + ] + + logger.info("Generating chat responses from HF transformers.") + hf_model = hf_runner_nm(model) + hf_outputs = hf_model.generate_greedy_logprobs_nm_use_tokens( + input_ids_lst, max_tokens, num_logprobs) + # Make sure all the memory is cleaned up. 
+ del hf_model + torch.cuda.empty_cache() + gc.collect() + time.sleep(1.0) + + logger.info("Generating chat responses from vLLM server.") + api_server_args = { + "--model": model, + "--max-model-len": 4096, + "--tensor-parallel-size": tensor_parallel_size, + } + + # bfloat16 requires at least Ampere. Set to float16 otherwise. + if DEVICE_CAPABILITY < 80: + api_server_args["--dtype"] = "half" + + # TODO: Update this to work like the benchmark script. + asyncio_event_loop = asyncio.get_event_loop() + with ServerContext(api_server_args, logger=logger) as _: + chats = [] + for messages in messages_list: + chats.append( + my_chat(client, model, messages, max_tokens, num_logprobs)) + # Gather results. + results = asyncio_event_loop.run_until_complete(asyncio.gather(*chats)) + + logger.info("Processing raw data from vLLM server.") + vllm_outputs = [] + + # See https://platform.openai.com/docs/api-reference/chat/create + for result in results: + req_output = result.choices[0] + output_str = req_output.message.content + + # Unpack from req_output.logprobs.content + # logprobs.content < list of list of token data + # logprobs.content[i].token < sampled token + # logprobs.content[i].top_logprobs < top logprobs + # logprobs.content[i].top_logprobs[j].token + # logprobs.content[i].top_logprobs[j].logprob + + output_tokens = [] + output_logprobs = [] + for token_data in req_output.logprobs.content: + # Actual sampled token. + output_tokens.append(token_data.token) + # Convert TopLogProb --> List[Dict[token, logprob]] + top_logprobs = {} + for top_logprob in token_data.top_logprobs: + top_logprobs[top_logprob.token] = top_logprob.logprob + output_logprobs.append(top_logprobs) + vllm_outputs.append((output_tokens, output_str, output_logprobs)) + + logger.info("Comparing results.") + check_logprobs_close( + outputs_0_lst=hf_outputs, + outputs_1_lst=vllm_outputs, + name_0="hf_model", + name_1="vllm_model", + ) diff --git a/tests/nm_utils/server.py b/tests/nm_utils/server.py index 1cff5a42176dc..989e9c053740a 100644 --- a/tests/nm_utils/server.py +++ b/tests/nm_utils/server.py @@ -1,6 +1,5 @@ import logging import os -import shlex import subprocess import sys import time @@ -10,8 +9,6 @@ import requests import torch -from tests.nm_utils.logging import log_banner - MAX_SERVER_START_WAIT = 15 * 60 # time (seconds) to wait for server to start @@ -31,14 +28,6 @@ def __init__(self, *args, ] - if logger: - log_banner( - logger, - "server startup command", - shlex.join(self.startup_command), - logging.DEBUG, - ) - self.proc = subprocess.Popen( [ sys.executable, "-m", "vllm.entrypoints.openai.api_server", @@ -95,8 +84,6 @@ def __init__(self, args: Dict[str, str], *, def __enter__(self): """Executes the server process and waits for it to become ready.""" ray.init(ignore_reinit_error=True) - log_banner(self._logger, "server startup command args", - shlex.join(self._args)) try: self.server_runner = ServerRunner.remote(self._args, diff --git a/tests/nm_utils/utils_skip.py b/tests/nm_utils/utils_skip.py new file mode 100644 index 0000000000000..cca1b85d87049 --- /dev/null +++ b/tests/nm_utils/utils_skip.py @@ -0,0 +1,134 @@ +"""Checks environment variables to skip various test groups. +The functions here are imported by each test file. +The .github/actions/nm-test-skipping-env-setup sets these + variables in the testing automation. 
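The per-group helpers defined in this new module all share one shape: read an environment variable that defaults to ENABLE and treat only the exact value DISABLE as a skip. A condensed sketch of that shared behavior (illustrative only; the patch itself keeps one explicit function per group):

import os

def _group_disabled(env_var: str) -> bool:
    # Groups run by default; only an explicit DISABLE turns them off.
    return os.getenv(env_var, "ENABLE") == "DISABLE"

The nm-test-skipping-env-setup action referenced above exports these variables (for example TEST_LORA=DISABLE) before pytest starts, so turning a group off is a one-variable change in the workflow.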
+""" + +import os + + +def should_skip_accuracy_test_group(): + TEST_ACCURACY = os.getenv("TEST_ACCURACY", "ENABLE") + return TEST_ACCURACY == "DISABLE" + + +def should_skip_async_engine_test_group(): + TEST_ASYNC_ENGINE = os.getenv("TEST_ASYNC_ENGINE", "ENABLE") + return TEST_ASYNC_ENGINE == "DISABLE" + + +def should_skip_basic_correctness_test_group(): + TEST_BASIC_CORRECTNESS = os.getenv("TEST_BASIC_CORRECTNESS", "ENABLE") + return TEST_BASIC_CORRECTNESS == "DISABLE" + + +def should_skip_core_test_group(): + TEST_CORE = os.getenv("TEST_CORE", "ENABLE") + return TEST_CORE == "DISABLE" + + +def should_skip_distributed_test_group(): + TEST_DISTRIBUTED = os.getenv("TEST_DISTRIBUTED", "ENABLE") + return TEST_DISTRIBUTED == "DISABLE" + + +def should_skip_engine_test_group(): + TEST_ENGINE = os.getenv("TEST_ENGINE", "ENABLE") + return TEST_ENGINE == "DISABLE" + + +def should_skip_entrypoints_test_group(): + TEST_ENTRYPOINTS = os.getenv("TEST_ENTRYPOINTS", "ENABLE") + return TEST_ENTRYPOINTS == "DISABLE" + + +def should_skip_kernels_test_groups(): + TEST_KERNELS = os.getenv("TEST_KERNELS", "ENABLE") + return TEST_KERNELS == "DISABLE" + + +def should_skip_lora_test_group(): + TEST_LORA = os.getenv("TEST_LORA", "ENABLE") + return TEST_LORA == "DISABLE" + + +def should_skip_metrics_test_group(): + TEST_METRICS = os.getenv("TEST_METRICS", "ENABLE") + return TEST_METRICS == "DISABLE" + + +def should_skip_model_executor_test_group(): + TEST_MODEL_EXECUTOR = os.getenv("TEST_MODEL_EXECUTOR", "ENABLE") + return TEST_MODEL_EXECUTOR == "DISABLE" + + +def should_skip_models_test_group(): + TEST_MODELS = os.getenv("TEST_MODELS", "ENABLE") + return TEST_MODELS == "DISABLE" + + +def should_skip_models_core_test_group(): + TEST_MODELS_CORE = os.getenv("TEST_MODELS_CORE", "ENABLE") + return TEST_MODELS_CORE == "DISABLE" + + +def should_skip_prefix_caching_test_group(): + TEST_PREFIX_CACHING = os.getenv("TEST_PREFIX_CACHING", "ENABLE") + return TEST_PREFIX_CACHING == "DISABLE" + + +def should_skip_quantization_test_group(): + TEST_QUANTIZATION = os.getenv("TEST_QUANTIZATION", "ENABLE") + return TEST_QUANTIZATION == "DISABLE" + + +def should_skip_samplers_test_group(): + TEST_SAMPLERS = os.getenv("TEST_SAMPLERS", "ENABLE") + return TEST_SAMPLERS == "DISABLE" + + +def should_skip_spec_decode_test_group(): + TEST_SPEC_DECODE = os.getenv("TEST_SPEC_DECODE", "ENABLE") + return TEST_SPEC_DECODE == "DISABLE" + + +def should_skip_tensorizer_loader_test_group(): + TEST_TENSORIZER_LOADER = os.getenv("TEST_TENSORIZER_LOADER", "ENABLE") + return TEST_TENSORIZER_LOADER == "DISABLE" + + +def should_skip_tokenization_test_group(): + TEST_TOKENIZATION = os.getenv("TEST_TOKENIZATION", "ENABLE") + return TEST_TOKENIZATION == "DISABLE" + + +def should_skip_worker_test_group(): + TEST_WORKER = os.getenv("TEST_WORKER", "ENABLE") + return TEST_WORKER == "DISABLE" + + +MAP = { + "TEST_ACCURACY": should_skip_accuracy_test_group, + "TEST_ASYNC_ENGINE": should_skip_async_engine_test_group, + "TEST_BASIC_CORRECTNESS": should_skip_basic_correctness_test_group, + "TEST_CORE": should_skip_core_test_group, + "TEST_DISTRIBUTED": should_skip_distributed_test_group, + "TEST_ENGINE": should_skip_engine_test_group, + "TEST_ENTRYPOINTS": should_skip_entrypoints_test_group, + "TEST_KERNELS": should_skip_kernels_test_groups, + "TEST_LORA": should_skip_lora_test_group, + "TEST_METRICS": should_skip_metrics_test_group, + "TEST_MODELS": should_skip_models_test_group, + "TEST_MODELS_CORE": should_skip_models_core_test_group, + "TEST_PREFIX_CACHING": 
should_skip_prefix_caching_test_group, + "TEST_QUANTIZATION": should_skip_quantization_test_group, + "TEST_SAMPLERS": should_skip_samplers_test_group, + "TEST_SPEC_DECODE": should_skip_spec_decode_test_group, + "TEST_TENSORIZER_LOADER": should_skip_tensorizer_loader_test_group, + "TEST_TOKENIZATION": should_skip_tokenization_test_group, + "TEST_WORKER": should_skip_worker_test_group, +} + + +def should_skip_test_group(group_name: str) -> bool: + return MAP[group_name]() diff --git a/tests/prefix_caching/test_disable_sliding_window.py b/tests/prefix_caching/test_disable_sliding_window.py index eeac6ab43c05f..1e2dc9197b403 100644 --- a/tests/prefix_caching/test_disable_sliding_window.py +++ b/tests/prefix_caching/test_disable_sliding_window.py @@ -5,8 +5,14 @@ import pytest from tests.conftest import cleanup +from tests.nm_utils.utils_skip import should_skip_test_group from vllm import LLM +if should_skip_test_group(group_name="TEST_PREFIX_CACHING"): + pytest.skip( + "TEST_PREFIX_CACHING=DISABLE, skipping prefix caching test group", + allow_module_level=True) + MODEL_LEN_LEN = [ # Example models with sliding window. ("bigcode/starcoder2-3b", 4096, 16384), diff --git a/tests/prefix_caching/test_prefix_caching.py b/tests/prefix_caching/test_prefix_caching.py index 305596e16ef1c..7c3be3a1367b2 100644 --- a/tests/prefix_caching/test_prefix_caching.py +++ b/tests/prefix_caching/test_prefix_caching.py @@ -4,9 +4,15 @@ """ import pytest +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.core.block_manager_v1 import CachedBlockAllocator from vllm.utils import Device +if should_skip_test_group(group_name="TEST_PREFIX_CACHING"): + pytest.skip( + "TEST_PREFIX_CACHING=DISABLE, skipping prefix caching test group", + allow_module_level=True) + @pytest.mark.parametrize("block_size", [16]) @pytest.mark.parametrize("num_blocks", [16]) diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py index e6d8218b41372..510175146910d 100644 --- a/tests/quantization/test_compressed_tensors.py +++ b/tests/quantization/test_compressed_tensors.py @@ -3,13 +3,19 @@ Run `pytest tests/quantization/test_compressed_tensors.py`. 
""" +import pytest import torch +from tests.nm_utils.utils_skip import should_skip_test_group from vllm import SamplingParams from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import ( # noqa: E501 CompressedTensorsLinearMethod, CompressedTensorsW8A8DynamicToken, CompressedTensorsW8A8StaticTensor) +if should_skip_test_group(group_name="TEST_QUANTIZATION"): + pytest.skip("TEST_QUANTIZATION=DISABLE, skipping quantization test group", + allow_module_level=True) + def test_compressed_tensors_w8a8_static_setup(vllm_runner): model_path = "nm-testing/tinyllama-oneshot-w8a8-static-v2" diff --git a/tests/quantization/test_configs.py b/tests/quantization/test_configs.py index 6820b2728e3c9..3b7dcbc5983fc 100644 --- a/tests/quantization/test_configs.py +++ b/tests/quantization/test_configs.py @@ -7,8 +7,13 @@ import pytest +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.config import ModelConfig +if should_skip_test_group(group_name="TEST_QUANTIZATION"): + pytest.skip("TEST_QUANTIZATION=DISABLE, skipping quantization test group", + allow_module_level=True) + @dataclass class ModelPair: diff --git a/tests/quantization/test_fp8.py b/tests/quantization/test_fp8.py index fccce7f7b59a7..96bcd8a491d6c 100644 --- a/tests/quantization/test_fp8.py +++ b/tests/quantization/test_fp8.py @@ -5,9 +5,14 @@ import pytest import torch +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS from vllm.model_executor.layers.quantization.fp8 import Fp8LinearMethod +if should_skip_test_group(group_name="TEST_QUANTIZATION"): + pytest.skip("TEST_QUANTIZATION=DISABLE, skipping quantization test group", + allow_module_level=True) + capability = torch.cuda.get_device_capability() capability = capability[0] * 10 + capability[1] diff --git a/tests/samplers/test_beam_search.py b/tests/samplers/test_beam_search.py index 64f3ce94b7a83..19aab9d08e0cb 100644 --- a/tests/samplers/test_beam_search.py +++ b/tests/samplers/test_beam_search.py @@ -5,6 +5,12 @@ import pytest +from tests.nm_utils.utils_skip import should_skip_test_group + +if should_skip_test_group(group_name="TEST_SAMPLERS"): + pytest.skip("TEST_SAMPLERS=DISABLE, skipping sampler test group", + allow_module_level=True) + # FIXME(zhuohan): The test can not pass if we: # 1. Increase max_tokens to 256. # 2. Increase beam_width to 8. diff --git a/tests/samplers/test_ignore_eos.py b/tests/samplers/test_ignore_eos.py index dc2482d85a91f..1fed618b54fa4 100644 --- a/tests/samplers/test_ignore_eos.py +++ b/tests/samplers/test_ignore_eos.py @@ -5,8 +5,13 @@ import pytest +from tests.nm_utils.utils_skip import should_skip_test_group from vllm import SamplingParams +if should_skip_test_group(group_name="TEST_SAMPLERS"): + pytest.skip("TEST_SAMPLERS=DISABLE, skipping sampler test group", + allow_module_level=True) + # We also test with llama because it has generation_config to specify EOS # (past regression). 
 MODELS = ["facebook/opt-125m", "meta-llama/Llama-2-7b-hf"]
diff --git a/tests/samplers/test_logits_processor.py b/tests/samplers/test_logits_processor.py
index 2979470120710..49a7a18502ea6 100644
--- a/tests/samplers/test_logits_processor.py
+++ b/tests/samplers/test_logits_processor.py
@@ -1,8 +1,13 @@
 import pytest
 import torch
 
+from tests.nm_utils.utils_skip import should_skip_test_group
 from vllm import SamplingParams
 
+if should_skip_test_group(group_name="TEST_SAMPLERS"):
+    pytest.skip("TEST_SAMPLERS=DISABLE, skipping sampler test group",
+                allow_module_level=True)
+
 MODELS = ["facebook/opt-125m"]
diff --git a/tests/samplers/test_logprobs.py b/tests/samplers/test_logprobs.py
index 233540cdc391f..9dc0d6dfa7bbd 100644
--- a/tests/samplers/test_logprobs.py
+++ b/tests/samplers/test_logprobs.py
@@ -1,10 +1,15 @@
 import pytest
 import torch
 
+from tests.nm_utils.utils_skip import should_skip_test_group
 from vllm import SamplingParams
 
 from ..conftest import VllmRunner
 
+if should_skip_test_group(group_name="TEST_SAMPLERS"):
+    pytest.skip("TEST_SAMPLERS=DISABLE, skipping sampler test group",
+                allow_module_level=True)
+
 MODELS = ["facebook/opt-125m"]
diff --git a/tests/samplers/test_ranks.py b/tests/samplers/test_ranks.py
index ed2fee1ae252e..5d79d09e66590 100644
--- a/tests/samplers/test_ranks.py
+++ b/tests/samplers/test_ranks.py
@@ -1,7 +1,12 @@
 import pytest
 
+from tests.nm_utils.utils_skip import should_skip_test_group
 from vllm import SamplingParams
 
+if should_skip_test_group(group_name="TEST_SAMPLERS"):
+    pytest.skip("TEST_SAMPLERS=DISABLE, skipping sampler test group",
+                allow_module_level=True)
+
 MODELS = ["facebook/opt-125m"]
diff --git a/tests/samplers/test_rejection_sampler.py b/tests/samplers/test_rejection_sampler.py
index 273df509568d6..f7ce4d1d0c694 100644
--- a/tests/samplers/test_rejection_sampler.py
+++ b/tests/samplers/test_rejection_sampler.py
@@ -5,9 +5,14 @@
 import torch
 import torch.nn.functional as F
 
+from tests.nm_utils.utils_skip import should_skip_test_group
 from vllm.model_executor.layers.rejection_sampler import RejectionSampler
 from vllm.model_executor.utils import set_random_seed
 
+if should_skip_test_group(group_name="TEST_SAMPLERS"):
+    pytest.skip("TEST_SAMPLERS=DISABLE, skipping sampler test group",
+                allow_module_level=True)
+
 CUDA_DEVICES = [
     f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
 ]
diff --git a/tests/samplers/test_sampler.py b/tests/samplers/test_sampler.py
index f4a5eb621b573..03708e173ea33 100644
--- a/tests/samplers/test_sampler.py
+++ b/tests/samplers/test_sampler.py
@@ -8,12 +8,17 @@
 import torch
 from transformers import GenerationConfig, GenerationMixin
 
+from tests.nm_utils.utils_skip import should_skip_test_group
 from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.model_executor.utils import set_random_seed
 from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata
 from vllm.utils import Counter, is_pin_memory_available
 
+if should_skip_test_group(group_name="TEST_SAMPLERS"):
+    pytest.skip("TEST_SAMPLERS=DISABLE, skipping sampler test group",
+                allow_module_level=True)
+
 
 class MockLogitsSampler(Sampler):
diff --git a/tests/samplers/test_seeded_generate.py b/tests/samplers/test_seeded_generate.py
index 88067f19c8f07..1c91b00b3c6b4 100644
--- a/tests/samplers/test_seeded_generate.py
+++ b/tests/samplers/test_seeded_generate.py
@@ -8,9 +8,14 @@
 
 import pytest
 
+from tests.nm_utils.utils_skip import should_skip_test_group
 from vllm import SamplingParams
 from vllm.model_executor.utils import set_random_seed
 
+if should_skip_test_group(group_name="TEST_SAMPLERS"):
+    pytest.skip("TEST_SAMPLERS=DISABLE, skipping sampler test group",
+                allow_module_level=True)
+
 MODEL = "facebook/opt-125m"
 RANDOM_SEEDS = list(range(5))
diff --git a/tests/spec_decode/e2e/test_compatibility.py b/tests/spec_decode/e2e/test_compatibility.py
index 81f91c5e10b0d..5600272de9adb 100644
--- a/tests/spec_decode/e2e/test_compatibility.py
+++ b/tests/spec_decode/e2e/test_compatibility.py
@@ -1,9 +1,14 @@
 import pytest
 
+from tests.nm_utils.utils_skip import should_skip_test_group
 from vllm import SamplingParams
 
 from .conftest import get_output_from_llm_generator
 
+if should_skip_test_group(group_name="TEST_SPEC_DECODE"):
+    pytest.skip("TEST_SPEC_DECODE=DISABLE, skipping spec decode group",
+                allow_module_level=True)
+
 
 @pytest.mark.parametrize(
     "common_llm_kwargs",
diff --git a/tests/spec_decode/e2e/test_integration.py b/tests/spec_decode/e2e/test_integration.py
index 4a2b62151f8cd..14d4e3f33eb7e 100644
--- a/tests/spec_decode/e2e/test_integration.py
+++ b/tests/spec_decode/e2e/test_integration.py
@@ -4,8 +4,14 @@
 
 import pytest
 
+from tests.nm_utils.utils_skip import should_skip_test_group
+
 from .conftest import run_greedy_equality_correctness_test
 
+if should_skip_test_group(group_name="TEST_SPEC_DECODE"):
+    pytest.skip("TEST_SPEC_DECODE=DISABLE, skipping spec decode group",
+                allow_module_level=True)
+
 
 @pytest.mark.parametrize(
     "common_llm_kwargs",
diff --git a/tests/spec_decode/e2e/test_integration_dist.py b/tests/spec_decode/e2e/test_integration_dist.py
index d444ef24cbfda..80cfb7eb7b7d9 100644
--- a/tests/spec_decode/e2e/test_integration_dist.py
+++ b/tests/spec_decode/e2e/test_integration_dist.py
@@ -5,10 +5,15 @@
 import pytest
 import torch
 
+from tests.nm_utils.utils_skip import should_skip_test_group
 from vllm.utils import is_hip
 
 from .conftest import run_greedy_equality_correctness_test
 
+if should_skip_test_group(group_name="TEST_SPEC_DECODE"):
+    pytest.skip("TEST_SPEC_DECODE=DISABLE, skipping spec decode group",
+                allow_module_level=True)
+
 
 @pytest.mark.skipif(torch.cuda.device_count() < 2,
                     reason="Need at least 2 GPUs to run the test.")
diff --git a/tests/spec_decode/e2e/test_logprobs.py b/tests/spec_decode/e2e/test_logprobs.py
index c266b4c7ecebd..881e85c70fc3f 100644
--- a/tests/spec_decode/e2e/test_logprobs.py
+++ b/tests/spec_decode/e2e/test_logprobs.py
@@ -3,10 +3,15 @@
 
 import pytest
 
+from tests.nm_utils.utils_skip import should_skip_test_group
 from vllm import SamplingParams
 
 from .conftest import get_logprobs_from_llm_generator
 
+if should_skip_test_group(group_name="TEST_SPEC_DECODE"):
+    pytest.skip("TEST_SPEC_DECODE=DISABLE, skipping spec decode group",
+                allow_module_level=True)
+
 
 @pytest.mark.skip("Out of CPU Memory in NM Automation")
 @pytest.mark.parametrize(
diff --git a/tests/spec_decode/e2e/test_multistep_correctness.py b/tests/spec_decode/e2e/test_multistep_correctness.py
index 94d71fb012727..e9814d81d8f8d 100644
--- a/tests/spec_decode/e2e/test_multistep_correctness.py
+++ b/tests/spec_decode/e2e/test_multistep_correctness.py
@@ -33,11 +33,16 @@
 import pytest
 from transformers import AutoTokenizer
 
+from tests.nm_utils.utils_skip import should_skip_test_group
 from vllm import SamplingParams
 
 from .conftest import (get_output_from_llm_generator,
                        run_greedy_equality_correctness_test)
 
+if should_skip_test_group(group_name="TEST_SPEC_DECODE"):
+    pytest.skip("TEST_SPEC_DECODE=DISABLE, skipping spec decode group",
+                allow_module_level=True)
+
 
 @pytest.mark.parametrize(
     "common_llm_kwargs",
diff --git a/tests/spec_decode/e2e/test_ngram_correctness.py b/tests/spec_decode/e2e/test_ngram_correctness.py
index d475d37af6425..1dbdc2c82447d 100644
--- a/tests/spec_decode/e2e/test_ngram_correctness.py
+++ b/tests/spec_decode/e2e/test_ngram_correctness.py
@@ -26,8 +26,14 @@
 
 import pytest
 
+from tests.nm_utils.utils_skip import should_skip_test_group
+
 from .conftest import run_greedy_equality_correctness_test
 
+if should_skip_test_group(group_name="TEST_SPEC_DECODE"):
+    pytest.skip("TEST_SPEC_DECODE=DISABLE, skipping spec decode group",
+                allow_module_level=True)
+
 
 @pytest.mark.parametrize(
     "common_llm_kwargs",
diff --git a/tests/spec_decode/test_batch_expansion.py b/tests/spec_decode/test_batch_expansion.py
index 43cfd78ddb0cc..0b9ebe4e63556 100644
--- a/tests/spec_decode/test_batch_expansion.py
+++ b/tests/spec_decode/test_batch_expansion.py
@@ -1,10 +1,15 @@
 import pytest
 import torch
 
+from tests.nm_utils.utils_skip import should_skip_test_group
 from vllm.spec_decode.batch_expansion import BatchExpansionTop1Scorer
 
 from .utils import create_seq_group_metadata_from_prompts, mock_worker
 
+if should_skip_test_group(group_name="TEST_SPEC_DECODE"):
+    pytest.skip("TEST_SPEC_DECODE=DISABLE, skipping spec decode group",
+                allow_module_level=True)
+
 
 @pytest.mark.parametrize('num_target_seq_ids', [100])
 @pytest.mark.skip_global_cleanup
diff --git a/tests/spec_decode/test_dynamic_spec_decode.py b/tests/spec_decode/test_dynamic_spec_decode.py
index bb6d1c23a0039..afb73c8a92a03 100644
--- a/tests/spec_decode/test_dynamic_spec_decode.py
+++ b/tests/spec_decode/test_dynamic_spec_decode.py
@@ -3,6 +3,7 @@
 import pytest
 import torch
 
+from tests.nm_utils.utils_skip import should_skip_test_group
 from vllm.model_executor.layers.rejection_sampler import RejectionSampler
 from vllm.sequence import ExecuteModelRequest
 from vllm.spec_decode.metrics import AsyncMetricsCollector
@@ -12,7 +13,11 @@
 
 from .utils import create_batch, mock_worker
 
+if should_skip_test_group(group_name="TEST_SPEC_DECODE"):
+    pytest.skip("TEST_SPEC_DECODE=DISABLE, skipping spec decode group",
+                allow_module_level=True)
+
 
 @pytest.mark.parametrize('queue_size', [4])
 @pytest.mark.parametrize('batch_size', [1])
 @pytest.mark.parametrize('k', [1])
diff --git a/tests/spec_decode/test_metrics.py b/tests/spec_decode/test_metrics.py
index 312878804b86e..d1141d67c38f6 100644
--- a/tests/spec_decode/test_metrics.py
+++ b/tests/spec_decode/test_metrics.py
@@ -4,8 +4,13 @@
 import pytest
 import torch
 
+from tests.nm_utils.utils_skip import should_skip_test_group
 from vllm.spec_decode.metrics import AsyncMetricsCollector
 
+if should_skip_test_group(group_name="TEST_SPEC_DECODE"):
+    pytest.skip("TEST_SPEC_DECODE=DISABLE, skipping spec decode group",
+                allow_module_level=True)
+
 
 def test_initial_call_returns_none():
     """Expect first call to get metrics to return None.
diff --git a/tests/spec_decode/test_multi_step_worker.py b/tests/spec_decode/test_multi_step_worker.py
index 6cea6668acc91..358aecca926d4 100644
--- a/tests/spec_decode/test_multi_step_worker.py
+++ b/tests/spec_decode/test_multi_step_worker.py
@@ -4,6 +4,7 @@
 import pytest
 import torch
 
+from tests.nm_utils.utils_skip import should_skip_test_group
 from vllm.model_executor.utils import set_random_seed
 from vllm.sequence import ExecuteModelRequest, SamplerOutput
 from vllm.spec_decode.multi_step_worker import MultiStepWorker
@@ -14,6 +15,10 @@
                     create_seq_group_metadata_from_prompts, create_worker,
                     patch_execute_model_with_seeds, zero_kv_cache)
 
+if should_skip_test_group(group_name="TEST_SPEC_DECODE"):
+    pytest.skip("TEST_SPEC_DECODE=DISABLE, skipping spec decode group",
+                allow_module_level=True)
+
 
 @pytest.mark.parametrize('num_steps', list(range(1, 17)))
 def test_assert_enough_kv_space(num_steps: int):
diff --git a/tests/spec_decode/test_ngram_worker.py b/tests/spec_decode/test_ngram_worker.py
index b1537884f896e..9ce9213207714 100644
--- a/tests/spec_decode/test_ngram_worker.py
+++ b/tests/spec_decode/test_ngram_worker.py
@@ -1,11 +1,17 @@
+import pytest
 import torch
 
+from tests.nm_utils.utils_skip import should_skip_test_group
 from vllm.sequence import ExecuteModelRequest
 from vllm.spec_decode.ngram_worker import NGramWorker
 from vllm.spec_decode.top1_proposer import Top1Proposer
 
 from .utils import create_seq_group_metadata_from_prompts, create_worker
 
+if should_skip_test_group(group_name="TEST_SPEC_DECODE"):
+    pytest.skip("TEST_SPEC_DECODE=DISABLE, skipping spec decode group",
+                allow_module_level=True)
+
 
 def test_ngram_algo_correctness_for_single_no_match():
     """Verify our ngram algo find the right candidate in the prompt
diff --git a/tests/spec_decode/test_spec_decode_worker.py b/tests/spec_decode/test_spec_decode_worker.py
index ef9d32f73d668..4c098246ab1a4 100644
--- a/tests/spec_decode/test_spec_decode_worker.py
+++ b/tests/spec_decode/test_spec_decode_worker.py
@@ -5,6 +5,7 @@
 import pytest
 import torch
 
+from tests.nm_utils.utils_skip import should_skip_test_group
 from vllm.model_executor.layers.rejection_sampler import RejectionSampler
 from vllm.model_executor.utils import set_random_seed
 from vllm.sequence import ExecuteModelRequest, SamplerOutput
@@ -17,6 +18,10 @@
 
 from .utils import create_batch, create_sampler_output_list, mock_worker
 
+if should_skip_test_group(group_name="TEST_SPEC_DECODE"):
+    pytest.skip("TEST_SPEC_DECODE=DISABLE, skipping spec decode group",
+                allow_module_level=True)
+
 
 @pytest.mark.parametrize('k', [1, 2, 6])
 @pytest.mark.parametrize('batch_size', [1, 2, 32])
diff --git a/tests/spec_decode/test_utils.py b/tests/spec_decode/test_utils.py
index 6b6f35a1a1d05..bdc72346ab011 100644
--- a/tests/spec_decode/test_utils.py
+++ b/tests/spec_decode/test_utils.py
@@ -2,9 +2,14 @@
 
 import pytest
 
+from tests.nm_utils.utils_skip import should_skip_test_group
 from vllm.sequence import SequenceGroupMetadata
 from vllm.spec_decode.util import get_all_seq_ids, split_batch_by_proposal_len
 
+if should_skip_test_group(group_name="TEST_SPEC_DECODE"):
+    pytest.skip("TEST_SPEC_DECODE=DISABLE, skipping spec decode group",
+                allow_module_level=True)
+
 
 def test_get_all_seq_ids():
     """Verify get_all_seq_ids extracts all seq ids.
diff --git a/tests/tensorizer_loader/test_tensorizer.py b/tests/tensorizer_loader/test_tensorizer.py
index 70d789e97c12c..39dc67a3f336f 100644
--- a/tests/tensorizer_loader/test_tensorizer.py
+++ b/tests/tensorizer_loader/test_tensorizer.py
@@ -7,6 +7,7 @@
 import pytest
 import ray
 
+from tests.nm_utils.utils_skip import should_skip_test_group
 from tests.utils import ServerRunner
 from vllm import SamplingParams
 # yapf: disable
@@ -20,6 +21,10 @@
 
 # yapf conflicts with isort for this docstring
 
+if should_skip_test_group(group_name="TEST_TENSORIZER_LOADER"):
+    pytest.skip("TEST_TENSORIZER_LOADER=DISABLE, skipping tensorizer group",
+                allow_module_level=True)
+
 prompts = [
     "Hello, my name is",
     "The president of the United States is",
diff --git a/tests/test_sharded_state_loader.py b/tests/test_sharded_state_loader.py
index de79c3b945d4d..5bb3a5c5d65e0 100644
--- a/tests/test_sharded_state_loader.py
+++ b/tests/test_sharded_state_loader.py
@@ -72,6 +72,7 @@ def _run_generate(input_dir, queue: mp.Queue, **kwargs):
     queue.join_thread()
 
 
+@pytest.mark.skip("Timeout error in NM automation. Work to re-enable.")
 @pytest.mark.parametrize("enable_lora", [False, True])
 @pytest.mark.parametrize("tp_size", [1, 2])
 def test_sharded_state_loader(enable_lora, tp_size, num_gpus_available,
diff --git a/tests/tokenization/test_cached_tokenizer.py b/tests/tokenization/test_cached_tokenizer.py
index 4c8238fd8d113..dbd17b88282a0 100644
--- a/tests/tokenization/test_cached_tokenizer.py
+++ b/tests/tokenization/test_cached_tokenizer.py
@@ -1,9 +1,15 @@
 from copy import deepcopy
 
+import pytest
 from transformers import AutoTokenizer
 
+from tests.nm_utils.utils_skip import should_skip_test_group
 from vllm.transformers_utils.tokenizer import get_cached_tokenizer
 
+if should_skip_test_group(group_name="TEST_TOKENIZATION"):
+    pytest.skip("TEST_TOKENIZATION=DISABLE, skipping tokenization test group",
+                allow_module_level=True)
+
 
 def test_cached_tokenizer():
     reference_tokenizer = AutoTokenizer.from_pretrained("gpt2")
diff --git a/tests/tokenization/test_detokenize.py b/tests/tokenization/test_detokenize.py
index 8d019fe5f38ca..a48cfe6fed01f 100644
--- a/tests/tokenization/test_detokenize.py
+++ b/tests/tokenization/test_detokenize.py
@@ -3,11 +3,16 @@
 import pytest
 from transformers import AutoTokenizer
 
+from tests.nm_utils.utils_skip import should_skip_test_group
 from vllm.sequence import Logprob, SamplingParams, Sequence, SequenceGroup
 from vllm.transformers_utils.detokenizer import (Detokenizer,
                                                  detokenize_incrementally)
 from vllm.transformers_utils.tokenizer_group import get_tokenizer_group
 
+if should_skip_test_group(group_name="TEST_TOKENIZATION"):
+    pytest.skip("TEST_TOKENIZATION=DISABLE, skipping tokenization test group",
+                allow_module_level=True)
+
 TRUTH = [
     "Hello here, this is a simple test",
     "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs. It is designed to be used in production environments, where inference and serving",  # noqa
diff --git a/tests/tokenization/test_tokenizer.py b/tests/tokenization/test_tokenizer.py
index 8db7204f15d4e..119fbd2d02e4f 100644
--- a/tests/tokenization/test_tokenizer.py
+++ b/tests/tokenization/test_tokenizer.py
@@ -1,8 +1,13 @@
 import pytest
 from transformers import PreTrainedTokenizerBase
 
+from tests.nm_utils.utils_skip import should_skip_test_group
 from vllm.transformers_utils.tokenizer import get_tokenizer
 
+if should_skip_test_group(group_name="TEST_TOKENIZATION"):
+    pytest.skip("TEST_TOKENIZATION=DISABLE, skipping tokenization test group",
+                allow_module_level=True)
+
 TOKENIZER_NAMES = [
     "facebook/opt-125m",
     "gpt2",
diff --git a/tests/tokenization/test_tokenizer_group.py b/tests/tokenization/test_tokenizer_group.py
index 31571dbfff6f6..e18ee99494f65 100644
--- a/tests/tokenization/test_tokenizer_group.py
+++ b/tests/tokenization/test_tokenizer_group.py
@@ -5,6 +5,7 @@
 import pytest
 from transformers import AutoTokenizer, PreTrainedTokenizerBase
 
+from tests.nm_utils.utils_skip import should_skip_test_group
 from vllm.transformers_utils.tokenizer_group import get_tokenizer_group
 from vllm.transformers_utils.tokenizer_group.ray_tokenizer_group import (
     RayTokenizerGroupPool)
@@ -13,6 +14,10 @@
 
 from ..conftest import get_tokenizer_pool_config
 
+if should_skip_test_group(group_name="TEST_TOKENIZATION"):
+    pytest.skip("TEST_TOKENIZATION=DISABLE, skipping tokenization test group",
+                allow_module_level=True)
+
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize("tokenizer_group_type", [None, "ray"])
diff --git a/tests/worker/test_model_runner.py b/tests/worker/test_model_runner.py
index 92de545acd53d..e40de0dc027d8 100644
--- a/tests/worker/test_model_runner.py
+++ b/tests/worker/test_model_runner.py
@@ -1,6 +1,7 @@
 import pytest
 import torch
 
+from tests.nm_utils.utils_skip import should_skip_test_group
 from vllm.distributed.parallel_state import init_distributed_environment
 from vllm.engine.arg_utils import EngineArgs
 from vllm.model_executor.sampling_metadata import SamplingMetadata
@@ -8,6 +9,10 @@
 from vllm.utils import get_open_port
 from vllm.worker.model_runner import ModelRunner, _get_graph_batch_size
 
+if should_skip_test_group(group_name="TEST_WORKER"):
+    pytest.skip("TEST_WORKER=DISABLE, skipping worker test group",
+                allow_module_level=True)
+
 
 def _create_model_runner(model: str, *args, **kwargs) -> ModelRunner:
     engine_args = EngineArgs(model, *args, **kwargs)
diff --git a/tests/worker/test_swap.py b/tests/worker/test_swap.py
index d941ffdb5588a..cd3807a133cd4 100644
--- a/tests/worker/test_swap.py
+++ b/tests/worker/test_swap.py
@@ -1,10 +1,16 @@
+import pytest
 import torch
 
+from tests.nm_utils.utils_skip import should_skip_test_group
 from vllm.engine.arg_utils import EngineArgs
 from vllm.sequence import ExecuteModelRequest
 from vllm.utils import get_distributed_init_method, get_ip, get_open_port
 from vllm.worker.worker import Worker
 
+if should_skip_test_group(group_name="TEST_WORKER"):
+    pytest.skip("TEST_WORKER=DISABLE, skipping worker test group",
+                allow_module_level=True)
+
 
 def test_swap() -> None:
     # Configure the engine.