From ce8a19bbf7635d56ac03d969f9e6fc6cbc809cad Mon Sep 17 00:00:00 2001
From: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com>
Date: Fri, 14 Jun 2024 10:18:19 -0400
Subject: [PATCH] Remote push refactor (#297)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

SUMMARY:
* updated the model test structure to focus on core models
* refactored tests to use environment variables (currently at the "test group" level, so each folder has an env variable); all tests are off by default and must be explicitly enabled
* refactored the build-test workflow to use a list of env variables rather than a skip-test list

WHY:
* this enables us to be more sane about which tests are and are not turned on, as opposed to maintaining a long list of skipped files
* this enables us to actually track what is run and what is not run (via testmo, which tracks skipped tests)
* this enables us to have more fine-grained control over what is run vs. not run (we can add more env vars at the sub-group level to turn off more tests)

---------

Signed-off-by: kerthcet Signed-off-by: Muralidhar Andoorveedu Signed-off-by: pandyamarut Co-authored-by: Alexander Matveev <59768536+alexm-neuralmagic@users.noreply.github.com> Co-authored-by: Woosuk Kwon Co-authored-by: Cyrus Leung Co-authored-by: Wenwei Zhang <40779233+ZwwWayne@users.noreply.github.com> Co-authored-by: Alexei-V-Ivanov-AMD <156011006+Alexei-V-Ivanov-AMD@users.noreply.github.com> Co-authored-by: Alexey Kondratiev Co-authored-by: Mor Zusman Co-authored-by: Mor Zusman Co-authored-by: Aurick Qiao Co-authored-by: Kuntai Du Co-authored-by: Antoni Baum Co-authored-by: HUANG Fei Co-authored-by: Isotr0py <41363108+Isotr0py@users.noreply.github.com> Co-authored-by: Simon Mo Co-authored-by: Michael Goin Co-authored-by: Kante Yin Co-authored-by: sasha0552 Co-authored-by: SangBin Cho Co-authored-by: Tyler Michael Smith Co-authored-by: Cody Yu Co-authored-by: raywanb <112235519+raywanb@users.noreply.github.com> Co-authored-by: Nick Hill Co-authored-by: Philipp Moritz Co-authored-by: Letian Li Co-authored-by: Murali Andoorveedu <37849411+andoorve@users.noreply.github.com> Co-authored-by: Dipika Sikka Co-authored-by: Varun Sundar Rabindranath Co-authored-by: Varun Sundar Rabindranath Co-authored-by: Elisei Smirnov <61423871+kezouke@users.noreply.github.com> Co-authored-by: Elisei Smirnov Co-authored-by: youkaichao Co-authored-by: leiwen83 Co-authored-by: Lei Wen Co-authored-by: Eric Xihui Lin Co-authored-by: beagleski Co-authored-by: bapatra Co-authored-by: Barun Patra Co-authored-by: Lily Liu Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com> Co-authored-by: Zhuohan Li Co-authored-by: Isotr0py <2037008807@qq.com> Co-authored-by: Michał Moskal Co-authored-by: Ruth Evans Co-authored-by: Divakar Verma <137818590+divakar-amd@users.noreply.github.com> Co-authored-by: Roger Wang Co-authored-by: Junichi Sato Co-authored-by: Marut Pandya Co-authored-by: afeldman-nm <156691304+afeldman-nm@users.noreply.github.com> Co-authored-by: Ronen Schaffer Co-authored-by: Itay Etelis <92247226+Etelis@users.noreply.github.com> Co-authored-by: omkar kakarparthi <75638701+okakarpa@users.noreply.github.com> Co-authored-by: Alexei V.
Ivanov Co-authored-by: Breno Faria Co-authored-by: Breno Faria Co-authored-by: Hyunsung Lee Co-authored-by: Chansung Park Co-authored-by: SnowDist Co-authored-by: functionxu123 <1229853312@qq.com> Co-authored-by: xuhao Co-authored-by: Domenic Barbuzzi --- .../actions/nm-set-env-test-skip/action.yml | 15 ++ .github/workflows/nm-build-test.yml | 30 +-- .github/workflows/nm-nightly.yml | 9 +- .github/workflows/nm-release.yml | 8 +- .github/workflows/nm-remote-push.yml | 98 ++++----- .github/workflows/nm-test.yml | 15 +- .github/workflows/nm-weekly.yml | 2 +- neuralmagic/tests/test_skip_env_vars/full.txt | 19 ++ .../tests/test_skip_env_vars/smoke.txt | 19 ++ requirements-dev.txt | 2 + tests/accuracy/test_lm_eval_correctness.py | 5 + tests/async_engine/test_api_server.py | 6 + tests/async_engine/test_async_llm_engine.py | 5 + tests/async_engine/test_chat_template.py | 5 + tests/async_engine/test_openapi_server_ray.py | 5 + tests/async_engine/test_request_tracker.py | 5 + .../test_basic_correctness.py | 6 + .../test_basic_server_correctness.py | 188 ----------------- .../basic_correctness/test_chunked_prefill.py | 7 + tests/basic_correctness/test_preemption.py | 6 + tests/conftest.py | 14 +- tests/core/block/e2e/test_correctness.py | 5 + .../e2e/test_correctness_sliding_window.py | 5 + tests/core/block/test_block_manager_v2.py | 5 + tests/core/block/test_block_table.py | 5 + tests/core/block/test_common.py | 5 + .../block/test_cpu_gpu_block_allocator.py | 5 + tests/core/block/test_naive_block.py | 5 + tests/core/block/test_prefix_caching_block.py | 5 + tests/core/test_block_manager.py | 5 + tests/core/test_chunked_prefill_scheduler.py | 5 + tests/core/test_scheduler.py | 5 + .../test_basic_distributed_correctness.py | 6 + .../test_chunked_prefill_distributed.py | 6 + tests/distributed/test_comm_ops.py | 5 + tests/distributed/test_custom_all_reduce.py | 5 + tests/distributed/test_pynccl.py | 5 + .../output_processor/test_multi_step.py | 5 + .../output_processor/test_stop_checker.py | 5 + tests/engine/test_computed_prefix_blocks.py | 5 + tests/engine/test_detokenization.py | 5 + tests/engine/test_multiproc_workers.py | 5 + tests/engine/test_skip_tokenizer_init.py | 5 + tests/engine/test_stop_reason.py | 5 + tests/engine/test_stop_strings.py | 5 + tests/entrypoints/openai/test_serving_chat.py | 5 + tests/entrypoints/test_guided_processors.py | 5 + tests/entrypoints/test_llm_encode.py | 5 + tests/entrypoints/test_llm_generate.py | 5 + tests/entrypoints/test_openai_run_batch.py | 7 + tests/entrypoints/test_openai_server.py | 5 + .../test_server_oot_registration.py | 5 + tests/kernels/test_activation.py | 5 + tests/kernels/test_attention.py | 5 + tests/kernels/test_attention_selector.py | 5 + tests/kernels/test_blocksparse_attention.py | 5 + tests/kernels/test_cache.py | 5 + tests/kernels/test_cutlass.py | 5 + tests/kernels/test_flash_attn.py | 6 + tests/kernels/test_int8_quant.py | 5 + tests/kernels/test_layernorm.py | 5 + tests/kernels/test_marlin_gemm.py | 5 + tests/kernels/test_moe.py | 5 + tests/kernels/test_pos_encoding.py | 5 + tests/kernels/test_prefix_prefill.py | 5 + tests/kernels/test_rand.py | 5 + tests/kernels/test_sampler.py | 5 + tests/lora/test_baichuan.py | 7 +- tests/lora/test_chatglm3.py | 7 + tests/lora/test_gemma.py | 5 + tests/lora/test_layer_variation.py | 5 + tests/lora/test_layers.py | 5 + tests/lora/test_llama.py | 5 + tests/lora/test_long_context.py | 5 + tests/lora/test_lora.py | 5 + tests/lora/test_lora_checkpoints.py | 5 + tests/lora/test_lora_manager.py | 5 + 
tests/lora/test_mixtral.py | 5 + tests/lora/test_phi.py | 7 + tests/lora/test_punica.py | 5 + tests/lora/test_quant_model.py | 5 + tests/lora/test_tokenizer_group.py | 5 + tests/lora/test_utils.py | 5 + tests/lora/test_worker.py | 7 + tests/metrics/test_metrics.py | 5 + tests/model_executor/weight_utils.py | 6 + tests/models/compare_utils.py | 1 + tests/models/test_aqlm.py | 5 + tests/models/test_big_models.py | 6 + tests/models/test_compressed.py | 61 ------ tests/models/test_embedding.py | 6 + tests/models/test_fp8.py | 5 + tests/models/test_gptq_marlin.py | 5 + tests/models/test_gptq_marlin_24.py | 5 + tests/models/test_llava.py | 5 + tests/models/test_marlin.py | 5 + tests/models/test_mistral.py | 6 + tests/models/test_models.py | 6 + tests/models/test_models_logprobs.py | 5 + tests/models/test_oot_registration.py | 6 + tests/models/test_registry.py | 5 + tests/models_core/__init__.py | 0 tests/models_core/test_llm_logprobs.py | 57 +++++ .../test_magic_wand.py} | 51 +++-- tests/models_core/test_server_logprobs.py | 194 ++++++++++++++++++ tests/nm_utils/server.py | 13 -- tests/nm_utils/utils_skip.py | 134 ++++++++++++ .../test_disable_sliding_window.py | 6 + tests/prefix_caching/test_prefix_caching.py | 6 + tests/quantization/test_compressed_tensors.py | 6 + tests/quantization/test_configs.py | 5 + tests/quantization/test_fp8.py | 5 + tests/samplers/test_beam_search.py | 6 + tests/samplers/test_ignore_eos.py | 5 + tests/samplers/test_logits_processor.py | 5 + tests/samplers/test_logprobs.py | 5 + tests/samplers/test_ranks.py | 5 + tests/samplers/test_rejection_sampler.py | 5 + tests/samplers/test_sampler.py | 5 + tests/samplers/test_seeded_generate.py | 5 + tests/spec_decode/e2e/test_compatibility.py | 5 + tests/spec_decode/e2e/test_integration.py | 6 + .../spec_decode/e2e/test_integration_dist.py | 5 + tests/spec_decode/e2e/test_logprobs.py | 5 + .../e2e/test_multistep_correctness.py | 5 + .../spec_decode/e2e/test_ngram_correctness.py | 6 + tests/spec_decode/test_batch_expansion.py | 5 + tests/spec_decode/test_dynamic_spec_decode.py | 14 ++ tests/spec_decode/test_metrics.py | 5 + tests/spec_decode/test_multi_step_worker.py | 5 + tests/spec_decode/test_ngram_worker.py | 6 + tests/spec_decode/test_spec_decode_worker.py | 5 + tests/spec_decode/test_utils.py | 5 + tests/tensorizer_loader/test_tensorizer.py | 5 + tests/test_sharded_state_loader.py | 1 + tests/tokenization/test_cached_tokenizer.py | 6 + tests/tokenization/test_detokenize.py | 5 + tests/tokenization/test_tokenizer.py | 5 + tests/tokenization/test_tokenizer_group.py | 5 + tests/worker/test_model_runner.py | 5 + tests/worker/test_swap.py | 6 + 141 files changed, 1208 insertions(+), 365 deletions(-) create mode 100644 .github/actions/nm-set-env-test-skip/action.yml create mode 100644 neuralmagic/tests/test_skip_env_vars/full.txt create mode 100644 neuralmagic/tests/test_skip_env_vars/smoke.txt delete mode 100644 tests/basic_correctness/test_basic_server_correctness.py delete mode 100644 tests/models/test_compressed.py create mode 100644 tests/models_core/__init__.py create mode 100644 tests/models_core/test_llm_logprobs.py rename tests/{models/test_compressed_memory.py => models_core/test_magic_wand.py} (54%) create mode 100644 tests/models_core/test_server_logprobs.py create mode 100644 tests/nm_utils/utils_skip.py diff --git a/.github/actions/nm-set-env-test-skip/action.yml b/.github/actions/nm-set-env-test-skip/action.yml new file mode 100644 index 0000000000000..fb84bc3a6ef9e --- /dev/null +++ 
b/.github/actions/nm-set-env-test-skip/action.yml @@ -0,0 +1,15 @@ +name: set test skip env vars +description: 'sets env variables for test skipping. See tests/utils_skip.py' +inputs: + test_skip_env_vars: + description: 'file with list of env vars controlling which tests to run.' + required: true + +runs: + using: composite + steps: + - run: | + cat "${ENV_VAR_FILE}" >> $GITHUB_ENV + env: + ENV_VAR_FILE: ${{ inputs.test_skip_env_vars }} + shell: bash diff --git a/.github/workflows/nm-build-test.yml b/.github/workflows/nm-build-test.yml index aa58273d33648..fd677c9651a60 100644 --- a/.github/workflows/nm-build-test.yml +++ b/.github/workflows/nm-build-test.yml @@ -45,8 +45,8 @@ on: description: "git commit hash or branch name" type: string required: true - test_skip_list: - description: 'file containing tests to skip' + test_skip_env_vars: + description: 'file with list of env vars controlling which tests to run' type: string required: true # benchmark related parameters @@ -91,22 +91,22 @@ jobs: gitref: ${{ github.ref }} python: ${{ inputs.python }} whl: ${{ needs.BUILD.outputs.whl }} - test_skip_list: ${{ inputs.test_skip_list }} + test_skip_env_vars: ${{ inputs.test_skip_env_vars }} secrets: inherit # TODO: re-enable - TEST-MULTI: - needs: [BUILD] - if: success() # && contains(fromJSON('["NIGHTLY", "WEEKLY", "RELEASE"]'), inputs.wf_category) - uses: ./.github/workflows/nm-test.yml - with: - test_label: ${{ inputs.test_label_multi }} - timeout: ${{ inputs.test_timeout }} - gitref: ${{ github.ref }} - python: ${{ inputs.python }} - whl: ${{ needs.BUILD.outputs.whl }} - test_skip_list: ${{ inputs.test_skip_list }} - secrets: inherit + # TEST-MULTI: + # needs: [BUILD] + # if: success() && contains(fromJSON('["NIGHTLY", "WEEKLY", "RELEASE"]'), inputs.wf_category) + # uses: ./.github/workflows/nm-test.yml + # with: + # test_label: ${{ inputs.test_label_multi }} + # timeout: ${{ inputs.test_timeout }} + # gitref: ${{ github.ref }} + # python: ${{ inputs.python }} + # whl: ${{ needs.BUILD.outputs.whl }} + # test_skip_env_vars: ${{ inputs.test_skip_env_vars }} + # secrets: inherit UPLOAD: needs: [TEST-SOLO] diff --git a/.github/workflows/nm-nightly.yml b/.github/workflows/nm-nightly.yml index 2c5dc38dd6322..fecd49c190c46 100644 --- a/.github/workflows/nm-nightly.yml +++ b/.github/workflows/nm-nightly.yml @@ -1,4 +1,4 @@ -name: nm Nightly +name: nm nightly run-name: ${{ github.actor }} triggered nightly on ${{ github.ref }} on: schedule: @@ -45,7 +45,7 @@ jobs: test_label_solo: gcp-k8s-l4-solo test_label_multi: ignore test_timeout: 480 - test_skip_list: neuralmagic/tests/skip-for-nightly.txt + test_skip_env_vars: neuralmagic/tests/test_skip_env_vars/full.txt benchmark_label: gcp-k8s-l4-solo benchmark_config_list_file: ./.github/data/nm_benchmark_remote_push_configs_list.txt @@ -63,7 +63,7 @@ jobs: test_label_solo: aws-avx2-32G-a10g-24G test_label_multi: ignore test_timeout: 480 - test_skip_list: neuralmagic/tests/skip-for-nightly.txt + test_skip_env_vars: neuralmagic/tests/test_skip_env_vars/full.txt benchmark_label: gcp-k8s-l4-solo benchmark_config_list_file: ./.github/data/nm_benchmark_remote_push_configs_list.txt @@ -81,7 +81,8 @@ jobs: test_label_solo: gcp-k8s-l4-solo test_label_multi: ignore test_timeout: 480 - test_skip_list: neuralmagic/tests/skip-for-nightly.txt + test_skip_env_vars: neuralmagic/tests/test_skip_env_vars/full.txt + benchmark_label: gcp-k8s-l4-solo benchmark_config_list_file: ./.github/data/nm_benchmark_remote_push_configs_list.txt diff --git 
a/.github/workflows/nm-release.yml b/.github/workflows/nm-release.yml index b52b9046d0daf..f5c9056cbc5d7 100644 --- a/.github/workflows/nm-release.yml +++ b/.github/workflows/nm-release.yml @@ -23,7 +23,7 @@ jobs: test_label_solo: gcp-k8s-l4-solo test_label_multi: ignore test_timeout: 720 - test_skip_list: neuralmagic/tests/skip-for-release.txt + test_skip_env_vars: neuralmagic/tests/test_skip_env_vars/full.txt benchmark_label: gcp-k8s-l4-solo benchmark_config_list_file: ./.github/data/nm_benchmark_nightly_configs_list.txt @@ -41,7 +41,7 @@ jobs: test_label_solo: gcp-k8s-l4-solo test_label_multi: ignore test_timeout: 720 - test_skip_list: neuralmagic/tests/skip-for-release.txt + test_skip_env_vars: neuralmagic/tests/test_skip_env_vars/full.txt benchmark_label: gcp-k8s-l4-solo benchmark_config_list_file: ./.github/data/nm_benchmark_nightly_configs_list.txt @@ -59,7 +59,7 @@ jobs: test_label_solo: gcp-k8s-l4-solo test_label_multi: ignore test_timeout: 720 - test_skip_list: neuralmagic/tests/skip-for-release.txt + test_skip_env_vars: neuralmagic/tests/test_skip_env_vars/full.txt benchmark_label: gcp-k8s-l4-solo benchmark_config_list_file: ./.github/data/nm_benchmark_nightly_configs_list.txt @@ -77,7 +77,7 @@ jobs: test_label_solo: gcp-k8s-l4-solo test_label_multi: ignore test_timeout: 720 - test_skip_list: neuralmagic/tests/skip-for-release.txt + test_skip_env_vars: neuralmagic/tests/test_skip_env_vars/full.txt benchmark_label: gcp-k8s-l4-solo benchmark_config_list_file: ./.github/data/nm_benchmark_nightly_configs_list.txt diff --git a/.github/workflows/nm-remote-push.yml b/.github/workflows/nm-remote-push.yml index d61618b482f84..3c1fe246756a4 100644 --- a/.github/workflows/nm-remote-push.yml +++ b/.github/workflows/nm-remote-push.yml @@ -12,37 +12,37 @@ concurrency: jobs: -# BUILD-TEST-3-8: -# uses: ./.github/workflows/nm-build-test.yml -# with: -# python: 3.8.17 -# gitref: ${{ github.ref }} -# -# test_label_solo: gcp-k8s-l4-solo -# test_label_multi: ignore -# test_timeout: 480 -# test_skip_list: neuralmagic/tests/skip-for-remote-push-tmp.txt -# -# benchmark_label: gcp-k8s-l4-solo -# benchmark_config_list_file: ./.github/data/nm_benchmark_remote_push_configs_list.txt -# benchmark_timeout: 480 -# secrets: inherit -# -# BUILD-TEST-3-9: -# uses: ./.github/workflows/nm-build-test.yml -# with: -# python: 3.9.17 -# gitref: ${{ github.ref }} -# -# test_label_solo: gcp-k8s-l4-solo -# test_label_multi: ignore -# test_timeout: 480 -# test_skip_list: neuralmagic/tests/skip-for-remote-push-tmp.txt -# -# benchmark_label: gcp-k8s-l4-solo -# benchmark_config_list_file: ./.github/data/nm_benchmark_remote_push_configs_list.txt -# benchmark_timeout: 480 -# secrets: inherit + BUILD-TEST-3-8: + uses: ./.github/workflows/nm-build-test.yml + with: + python: 3.8.17 + gitref: ${{ github.ref }} + + test_label_solo: gcp-k8s-l4-solo + test_label_multi: ignore + test_timeout: 480 + test_skip_env_vars: neuralmagic/tests/test_skip_env_vars/smoke.txt + + benchmark_label: gcp-k8s-l4-solo + benchmark_config_list_file: ./.github/data/nm_benchmark_remote_push_configs_list.txt + benchmark_timeout: 480 + secrets: inherit + + BUILD-TEST-3-9: + uses: ./.github/workflows/nm-build-test.yml + with: + python: 3.9.17 + gitref: ${{ github.ref }} + + test_label_solo: gcp-k8s-l4-solo + test_label_multi: ignore + test_timeout: 480 + test_skip_env_vars: neuralmagic/tests/test_skip_env_vars/smoke.txt + + benchmark_label: gcp-k8s-l4-solo + benchmark_config_list_file: ./.github/data/nm_benchmark_remote_push_configs_list.txt + 
benchmark_timeout: 480 + secrets: inherit BUILD-TEST-3-10: uses: ./.github/workflows/nm-build-test.yml @@ -51,27 +51,27 @@ jobs: gitref: ${{ github.ref }} test_label_solo: gcp-k8s-l4-solo - test_label_multi: gcp-k8s-l4-duo - test_timeout: 1440 - test_skip_list: neuralmagic/tests/skip-for-remote-push-tmp.txt + test_label_multi: ignore + test_timeout: 480 + test_skip_env_vars: neuralmagic/tests/test_skip_env_vars/smoke.txt benchmark_label: gcp-k8s-l4-solo benchmark_config_list_file: ./.github/data/nm_benchmark_remote_push_configs_list.txt benchmark_timeout: 480 secrets: inherit -# BUILD-TEST-3-11: -# uses: ./.github/workflows/nm-build-test.yml -# with: -# python: 3.11.4 -# gitref: ${{ github.ref }} -# -# test_label_solo: gcp-k8s-l4-solo -# test_label_multi: ignore -# test_timeout: 480 -# test_skip_list: neuralmagic/tests/skip-for-remote-push-tmp.txt -# -# benchmark_label: gcp-k8s-l4-solo -# benchmark_config_list_file: ./.github/data/nm_benchmark_remote_push_configs_list.txt -# benchmark_timeout: 480 -# secrets: inherit + BUILD-TEST-3-11: + uses: ./.github/workflows/nm-build-test.yml + with: + python: 3.11.4 + gitref: ${{ github.ref }} + + test_label_solo: gcp-k8s-l4-solo + test_label_multi: ignore + test_timeout: 480 + test_skip_env_vars: neuralmagic/tests/test_skip_env_vars/smoke.txt + + benchmark_label: gcp-k8s-l4-solo + benchmark_config_list_file: ./.github/data/nm_benchmark_remote_push_configs_list.txt + benchmark_timeout: 480 + secrets: inherit diff --git a/.github/workflows/nm-test.yml b/.github/workflows/nm-test.yml index 3e3f3adef3ef3..87860bcc356bb 100644 --- a/.github/workflows/nm-test.yml +++ b/.github/workflows/nm-test.yml @@ -23,8 +23,8 @@ on: description: "whl to test (variable appears late binding so unusable outside 'download artifact')" type: string required: true - test_skip_list: - description: 'file containing tests to skip' + test_skip_env_vars: + description: 'file containing tests env vars for test skipping' type: string required: true @@ -51,8 +51,8 @@ on: description: "whl to test (variable appears late binding so unusable outside 'download artifact')" type: string required: true - test_skip_list: - description: 'file containing tests to skip' + test_skip_env_vars: + description: 'file containing tests env vars for test skipping' type: string required: true @@ -131,12 +131,17 @@ jobs: - name: run buildkite script run: | cd tests && sudo bash ../.buildkite/download-images.sh + + - name: setenv test skip + id: setenv_test_skip + uses: ./.github/actions/nm-set-env-test-skip + with: + test_skip_env_vars: ${{ inputs.test_skip_env_vars }} - name: run tests id: test uses: ./.github/actions/nm-test-whl/ with: - test_skip_list: ${{ inputs.test_skip_list }} test_directory: tests test_results: test-results diff --git a/.github/workflows/nm-weekly.yml b/.github/workflows/nm-weekly.yml index c385e0c6d8510..d92a2619ef359 100644 --- a/.github/workflows/nm-weekly.yml +++ b/.github/workflows/nm-weekly.yml @@ -27,7 +27,7 @@ jobs: test_label_solo: aws-avx2-32G-a10g-24G test_label_multi: aws-avx2-192G-4-a10g-96G test_timeout: 480 - test_skip_list: neuralmagic/tests/skip-for-weekly.txt + test_skip_env_vars: neuralmagic/tests/test_skip_env_vars/full.txt benchmark_label: aws-avx2-32G-a10g-24G benchmark_config_list_file: ./.github/data/nm_benchmark_weekly_configs_list.txt diff --git a/neuralmagic/tests/test_skip_env_vars/full.txt b/neuralmagic/tests/test_skip_env_vars/full.txt new file mode 100644 index 0000000000000..9c6f69cacd225 --- /dev/null +++ 
b/neuralmagic/tests/test_skip_env_vars/full.txt @@ -0,0 +1,19 @@ +TEST_ACCURACY=DISABLE +TEST_ASYNC_ENGINE=ENABLE +TEST_BASIC_CORRECTNESS=ENABLE +TEST_CORE=ENABLE +TEST_DISTRIBUTED=DISABLE +TEST_ENGINE=ENABLE +TEST_ENTRYPOINTS=ENABLE +TEST_KERNELS=ENABLE +TEST_LORA=ENABLE +TEST_METRICS=ENABLE +TEST_MODELS=ENABLE +TEST_MODELS_CORE=ENABLE +TEST_PREFIX_CACHING=ENABLE +TEST_QUANTIZATION=ENABLE +TEST_SAMPLERS=ENABLE +TEST_SPEC_DECODE=DISABLE +TEST_TENSORIZER_LOADER=ENABLE +TEST_TOKENIZATION=ENABLE +TEST_WORKER=ENABLE diff --git a/neuralmagic/tests/test_skip_env_vars/smoke.txt b/neuralmagic/tests/test_skip_env_vars/smoke.txt new file mode 100644 index 0000000000000..5c5066aaee391 --- /dev/null +++ b/neuralmagic/tests/test_skip_env_vars/smoke.txt @@ -0,0 +1,19 @@ +TEST_ACCURACY=DISABLE +TEST_ASYNC_ENGINE=ENABLE +TEST_BASIC_CORRECTNESS=DISABLE +TEST_CORE=ENABLE +TEST_DISTRIBUTED=DISABLE +TEST_ENGINE=ENABLE +TEST_ENTRYPOINTS=DISABLE +TEST_KERNELS=DISABLE +TEST_LORA=DISABLE +TEST_METRICS=ENABLE +TEST_MODELS=DISABLE +TEST_MODELS_CORE=ENABLE +TEST_PREFIX_CACHING=ENABLE +TEST_QUANTIZATION=ENABLE +TEST_SAMPLERS=DISABLE +TEST_SPEC_DECODE=DISABLE +TEST_TENSORIZER_LOADER=DISABLE +TEST_TOKENIZATION=ENABLE +TEST_WORKER=ENABLE diff --git a/requirements-dev.txt b/requirements-dev.txt index 837ed9d495e10..587387a3d582a 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -31,6 +31,8 @@ peft requests==2.31 ray sentence-transformers # required for embedding +optimum # required for hf gptq baselines +auto-gptq # required for hf gptq baselines # Benchmarking aiohttp diff --git a/tests/accuracy/test_lm_eval_correctness.py b/tests/accuracy/test_lm_eval_correctness.py index ded6d98d6f6ad..4c1ac9638a10a 100644 --- a/tests/accuracy/test_lm_eval_correctness.py +++ b/tests/accuracy/test_lm_eval_correctness.py @@ -8,6 +8,11 @@ import yaml from tests.nm_utils.server import ServerContext +from tests.nm_utils.utils_skip import should_skip_test_group + +if should_skip_test_group(group_name="TEST_ACCURACY"): + pytest.skip("TEST_ACCURACY=DISABLE, skipping accuracy test group", + allow_module_level=True) if TYPE_CHECKING: import lm_eval as lm_eval_t diff --git a/tests/async_engine/test_api_server.py b/tests/async_engine/test_api_server.py index 7f57d5cf9b182..e2cddf228cce7 100644 --- a/tests/async_engine/test_api_server.py +++ b/tests/async_engine/test_api_server.py @@ -7,6 +7,12 @@ import pytest import requests +from tests.nm_utils.utils_skip import should_skip_test_group + +if should_skip_test_group(group_name="TEST_ASYNC_ENGINE"): + pytest.skip("TEST_ASYNC_ENGINE=DISABLE, skipping async engine test group", + allow_module_level=True) + def _query_server(prompt: str, max_tokens: int = 5) -> dict: response = requests.post("http://localhost:8000/generate", diff --git a/tests/async_engine/test_async_llm_engine.py b/tests/async_engine/test_async_llm_engine.py index 10a46422887e3..77801437e7581 100644 --- a/tests/async_engine/test_async_llm_engine.py +++ b/tests/async_engine/test_async_llm_engine.py @@ -3,8 +3,13 @@ import pytest +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.engine.async_llm_engine import AsyncLLMEngine +if should_skip_test_group(group_name="TEST_ASYNC_ENGINE"): + pytest.skip("TEST_ASYNC_ENGINE=DISABLE, skipping async engine test group", + allow_module_level=True) + @dataclass class RequestOutput: diff --git a/tests/async_engine/test_chat_template.py b/tests/async_engine/test_chat_template.py index 55b730812ea94..5e21ed2061a89 100644 --- a/tests/async_engine/test_chat_template.py 
+++ b/tests/async_engine/test_chat_template.py @@ -4,10 +4,15 @@ import pytest +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.entrypoints.openai.protocol import ChatCompletionRequest from vllm.entrypoints.openai.serving_chat import OpenAIServingChat from vllm.transformers_utils.tokenizer import get_tokenizer +if should_skip_test_group(group_name="TEST_ASYNC_ENGINE"): + pytest.skip("TEST_ASYNC_ENGINE=DISABLE, skipping async engine test group", + allow_module_level=True) + chatml_jinja_path = pathlib.Path(os.path.dirname(os.path.abspath( __file__))).parent.parent / "examples/template_chatml.jinja" assert chatml_jinja_path.exists() diff --git a/tests/async_engine/test_openapi_server_ray.py b/tests/async_engine/test_openapi_server_ray.py index 35d8808b7a699..60c45388c53a8 100644 --- a/tests/async_engine/test_openapi_server_ray.py +++ b/tests/async_engine/test_openapi_server_ray.py @@ -4,8 +4,13 @@ # and debugging. import ray +from tests.nm_utils.utils_skip import should_skip_test_group from tests.utils import ServerRunner +if should_skip_test_group(group_name="TEST_ASYNC_ENGINE"): + pytest.skip("TEST_ASYNC_ENGINE=DISABLE, skipping async engine test group", + allow_module_level=True) + # any model with a chat template should work here MODEL_NAME = "facebook/opt-125m" diff --git a/tests/async_engine/test_request_tracker.py b/tests/async_engine/test_request_tracker.py index 7b1f4a9e1eb2f..d217db1ba7068 100644 --- a/tests/async_engine/test_request_tracker.py +++ b/tests/async_engine/test_request_tracker.py @@ -1,8 +1,13 @@ import pytest +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.engine.async_llm_engine import RequestTracker from vllm.outputs import RequestOutput +if should_skip_test_group(group_name="TEST_ASYNC_ENGINE"): + pytest.skip("TEST_ASYNC_ENGINE=DISABLE, skipping async engine test group", + allow_module_level=True) + @pytest.mark.asyncio async def test_request_tracker(): diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py index 805b8883b9d94..fadc4998b4091 100644 --- a/tests/basic_correctness/test_basic_correctness.py +++ b/tests/basic_correctness/test_basic_correctness.py @@ -7,8 +7,14 @@ import pytest +from tests.nm_utils.utils_skip import should_skip_test_group from vllm import LLM +if should_skip_test_group(group_name="TEST_BASIC_CORRECTNESS"): + pytest.skip( + "TEST_BASIC_CORRECTNESS=DISABLE, skipping basic correctness test group", + allow_module_level=True) + MODELS = [ "facebook/opt-125m", "meta-llama/Llama-2-7b-hf", diff --git a/tests/basic_correctness/test_basic_server_correctness.py b/tests/basic_correctness/test_basic_server_correctness.py deleted file mode 100644 index c33d0aa46c8f2..0000000000000 --- a/tests/basic_correctness/test_basic_server_correctness.py +++ /dev/null @@ -1,188 +0,0 @@ -import asyncio -from os import getenv -from typing import Dict, List, Type - -import openai -import pytest -import torch -from datasets import load_dataset -from openai import AsyncOpenAI -from transformers import AutoTokenizer - -from tests.conftest import HfRunnerNM -from tests.models.compare_utils import check_logprobs_close -from tests.nm_utils.logging import make_logger -from tests.nm_utils.server import ServerContext -from vllm.model_executor.layers.quantization import get_quantization_config - - -@pytest.fixture(scope="session") -def client(): - client = openai.AsyncOpenAI( - base_url="http://localhost:8000/v1", - api_key="token-abc123", - ) - yield client - - 
-@pytest.fixture -def hf_runner_nm() -> Type[HfRunnerNM]: - return HfRunnerNM - - -async def my_chat( - client, - model: str, - messages: List[Dict], - max_tokens: int, - temperature: float, - num_logprobs: int, -): - """ submit a single prompt chat and collect results. """ - return await client.chat.completions.create(model=model, - messages=messages, - max_tokens=max_tokens, - temperature=temperature, - logprobs=True, - top_logprobs=num_logprobs) - - -@pytest.mark.parametrize("model, max_model_len, sparsity, gptq_config", [ - ("mistralai/Mistral-7B-Instruct-v0.2", 4096, None, None), - ("neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50", 4096, "sparse_w16a16", - None), - ("NousResearch/Llama-2-7b-chat-hf", 4096, None, None), - ("neuralmagic/Llama-2-7b-pruned70-retrained-ultrachat", 4096, - "sparse_w16a16", None), - ("microsoft/phi-2", 2048, None, None), - ("google/gemma-1.1-2b-it", 2056, None, None), - ("HuggingFaceH4/zephyr-7b-gemma-v0.1", 4096, None, None), -]) -@pytest.mark.parametrize("max_tokens", [32]) -@pytest.mark.parametrize("num_logprobs", [5]) -@pytest.mark.parametrize("tensor_parallel_size", [None]) -# note: repeating the test for 2 values of tensor_parallel_size -# increases the overall execution time by unnecessarily -# collecting the HuggingFace runner data twice. -# Consider refactoring to eliminate that repeat. -def test_models_on_server( - hf_runner_nm: HfRunnerNM, - client: AsyncOpenAI, - model: str, - max_model_len: int, - sparsity: str, - gptq_config: str, - tensor_parallel_size: int, - max_tokens: int, - num_logprobs: int, -) -> None: - """ - This test compares the output of the vllm OpenAI server against that of - a HuggingFace transformer. We expect them to be fairly close. "Close" - is measured by checking that the top 3 logprobs for each token includes - the token of the other inference tool. The first time that there is no - exact match, as long as there is a match to one of the top `num_logprobs` - logprobs, the test will not proceed further, but will pass. - - Parameters to the test identify a model to test, and key arguments - required for that model (see the `max_model_len`, `sparsity` and - `gptq_config` params below). The additional parametrizations expand test - coverage across the functional space of the server. - - :param hf_runner_nm: fixture for the HfRunnerNM - :param client: fixture with an openai.AsyncOpenAI client - :param model: The Hugginface id for a model to test with - :param max_model_len: passed to the vllm Server's --max-model-len option - :param sparsity: passed to the vllm Server's --sparsity option - :param gptq_config: quantization method id for this model. default None - means quantization isn't involved. - :param tensor_parallel_size: passed to the vllm Server's - --tensor_parallel_size option - :param max_tokens: the total number of tokens to consider for closeness - :param num_logprobs: the total number of logprobs included when - calculating closeness - """ - logger = make_logger("vllm_test") - # check that the requested gpu count is available in the test env - gpu_count = torch.cuda.device_count() - if tensor_parallel_size and gpu_count < tensor_parallel_size: - pytest.skip(f"gpu count {gpu_count} is insufficient for " - f"tensor_parallel_size = {tensor_parallel_size}") - - # skip this model if the current device does not have the required - # gpu capability. 
- device_capability = torch.cuda.get_device_capability() - capability = device_capability[0] * 10 + device_capability[1] - if gptq_config and ( - capability < - get_quantization_config(gptq_config).get_min_capability()): - pytest.skip("insufficient system GPU device capability " - f"({capability}) for this model") - - hf_token = getenv("HF_TOKEN", None) - logger.info("loading chat prompts for testing.") - ds = load_dataset("nm-testing/qa-chat-prompts", split="train_sft") - ds = ds.select(range(20)) - - num_chat_turns = 3 - messages_list = [row["messages"][:num_chat_turns] for row in ds] - tokenizer = AutoTokenizer.from_pretrained(model) - chat_prompts = [ - tokenizer.apply_chat_template(messages, - tokenize=False, - add_generation_prompt=True) - for messages in messages_list - ] - - logger.info("generating chat responses from HuggingFace runner.") - hf_model = hf_runner_nm(model, access_token=hf_token) - hf_outputs = hf_model.generate_greedy_logprobs_nm_use_tokens( - chat_prompts, max_tokens, num_logprobs, ignore_special_tokens=True) - - del hf_model - - logger.info("generating chat responses from vllm server.") - api_server_args = { - "--model": model, - "--max-model-len": max_model_len, - "--disable-log-requests": None, - } - if sparsity: - api_server_args["--sparsity"] = sparsity - if tensor_parallel_size: - api_server_args["--tensor-parallel-size"] = tensor_parallel_size - - # some devices will require a different `dtype` - if device_capability[0] < 8: - api_server_args["--dtype"] = "half" - - asyncio_event_loop = asyncio.get_event_loop() - temperature = 0.0 - with ServerContext(api_server_args, logger=logger) as _: - # submit an asynchronous request to the server for each prompt - chats = [ - my_chat(client, model, messages, max_tokens, temperature, - num_logprobs) - for messages in [query for query in messages_list] - ] - # await for all the requests to return, and gather their results - # in one place - results = asyncio_event_loop.run_until_complete(asyncio.gather(*chats)) - - logger.info("preparing results from vllm server requests to include " - "tokens and logprobs.") - vllm_outputs = list() - for task_result in results: - for req_output in task_result.choices: - output_str = req_output.message.content - output_tokens = req_output.logprobs.model_extra["tokens"] - output_logprobs = req_output.logprobs.model_extra["top_logprobs"] - vllm_outputs.append((output_tokens, output_str, output_logprobs)) - - logger.info("comparing HuggingFace and vllm Server chat responses") - check_logprobs_close( - outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_outputs, - name_0="hf_model", - name_1="vllm_model", - ) diff --git a/tests/basic_correctness/test_chunked_prefill.py b/tests/basic_correctness/test_chunked_prefill.py index 357bff61ef019..8cb033edd25a3 100644 --- a/tests/basic_correctness/test_chunked_prefill.py +++ b/tests/basic_correctness/test_chunked_prefill.py @@ -8,6 +8,13 @@ """ import pytest +from tests.nm_utils.utils_skip import should_skip_test_group + +if should_skip_test_group(group_name="TEST_BASIC_CORRECTNESS"): + pytest.skip( + "TEST_BASIC_CORRECTNESS=DISABLE, skipping basic correctness test group", + allow_module_level=True) + MODELS = [ "facebook/opt-125m", "meta-llama/Llama-2-7b-hf", diff --git a/tests/basic_correctness/test_preemption.py b/tests/basic_correctness/test_preemption.py index 7f20b2d934942..f16c1fb48b833 100644 --- a/tests/basic_correctness/test_preemption.py +++ b/tests/basic_correctness/test_preemption.py @@ -8,10 +8,16 @@ import pytest from prometheus_client 
import REGISTRY +from tests.nm_utils.utils_skip import should_skip_test_group from vllm import SamplingParams from vllm.core.scheduler import (ARTIFICIAL_PREEMPTION_MAX_CNT, ENABLE_ARTIFICIAL_PREEMPT) +if should_skip_test_group(group_name="TEST_BASIC_CORRECTNESS"): + pytest.skip( + "TEST_BASIC_CORRECTNESS=DISABLE, skipping basic correctness test group", + allow_module_level=True) + MODELS = [ "facebook/opt-125m", ] diff --git a/tests/conftest.py b/tests/conftest.py index 48c7f8c095f0e..31c6f12f5c030 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -148,6 +148,7 @@ def __init__( *, is_embedding_model: bool = False, is_vision_model: bool = False, + **kwargs, ) -> None: assert dtype in _STR_DTYPE_TO_TORCH_DTYPE torch_dtype = _STR_DTYPE_TO_TORCH_DTYPE[dtype] @@ -173,6 +174,7 @@ def __init__( model_name, torch_dtype=torch_dtype, trust_remote_code=True, + **kwargs, )) self.tokenizer = AutoTokenizer.from_pretrained( @@ -473,21 +475,21 @@ def _decode_token_by_position_index( def generate_greedy_logprobs_nm_use_tokens( self, - prompts: List[str], + input_ids_lst: List[torch.Tensor], max_tokens: int, topk_logprobs_count: int, - ignore_special_tokens: bool = False ) -> List[Tuple[List[int], str, List[Dict]]]: all_logprobs = [] all_output_tokens = [] all_output_strs = [] - for prompt in prompts: - input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids + for input_ids in input_ids_lst: output = self.model.generate( - input_ids.cuda(), - use_cache=True, + input_ids, do_sample=False, + temperature=None, # Explicitly set to avoid warning + top_p=None, # Explicitly set to avoid warning + top_k=None, # Explicitly set to avoid warning max_new_tokens=max_tokens, output_hidden_states=True, return_dict_in_generate=True, diff --git a/tests/core/block/e2e/test_correctness.py b/tests/core/block/e2e/test_correctness.py index ad253635e0ba0..604aba39c560d 100644 --- a/tests/core/block/e2e/test_correctness.py +++ b/tests/core/block/e2e/test_correctness.py @@ -2,10 +2,15 @@ import pytest +from tests.nm_utils.utils_skip import should_skip_test_group from vllm import SamplingParams from .conftest import get_token_ids_from_llm_generator +if should_skip_test_group(group_name="TEST_CORE"): + pytest.skip("TEST_CORE=DISABLE, skipping core test group", + allow_module_level=True) + @pytest.mark.parametrize( "common_llm_kwargs", diff --git a/tests/core/block/e2e/test_correctness_sliding_window.py b/tests/core/block/e2e/test_correctness_sliding_window.py index d77d6a1dbb741..37e9af1116c9d 100644 --- a/tests/core/block/e2e/test_correctness_sliding_window.py +++ b/tests/core/block/e2e/test_correctness_sliding_window.py @@ -3,10 +3,15 @@ import pytest +from tests.nm_utils.utils_skip import should_skip_test_group from vllm import LLM, SamplingParams from .conftest import get_text_from_llm_generator +if should_skip_test_group(group_name="TEST_CORE"): + pytest.skip("TEST_CORE=DISABLE, skipping core test group", + allow_module_level=True) + # relatively small model with 4k sliding window MODEL = "bigcode/starcoder2-3b" BLOCK_SIZE = 16 diff --git a/tests/core/block/test_block_manager_v2.py b/tests/core/block/test_block_manager_v2.py index d0ca09c4be0d4..aa059c45323bf 100644 --- a/tests/core/block/test_block_manager_v2.py +++ b/tests/core/block/test_block_manager_v2.py @@ -1,5 +1,6 @@ import pytest +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.core.block.utils import (STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE, STR_NOT_IMPL_ENC_DEC_SWA) from vllm.core.block_manager_v2 import BlockSpaceManagerV2 @@ 
-10,6 +11,10 @@ from ..utils import (create_dummy_prompt, create_seq_group, create_seq_group_encoder_decoder) +if should_skip_test_group(group_name="TEST_CORE"): + pytest.skip("TEST_CORE=DISABLE, skipping core test group", + allow_module_level=True) + @pytest.mark.parametrize("block_size", [16]) @pytest.mark.parametrize("num_gpu_blocks", [8, 40, 80]) diff --git a/tests/core/block/test_block_table.py b/tests/core/block/test_block_table.py index 6fb95cfdfab81..2a1c9945b93dc 100644 --- a/tests/core/block/test_block_table.py +++ b/tests/core/block/test_block_table.py @@ -1,9 +1,14 @@ import pytest +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.core.block.block_table import BlockTable from vllm.core.block.cpu_gpu_block_allocator import CpuGpuBlockAllocator from vllm.utils import Device, cdiv, chunk_list +if should_skip_test_group(group_name="TEST_CORE"): + pytest.skip("TEST_CORE=DISABLE, skipping core test group", + allow_module_level=True) + @pytest.mark.parametrize("block_size", [16]) @pytest.mark.parametrize("sequence_len", [1, 16, 129]) diff --git a/tests/core/block/test_common.py b/tests/core/block/test_common.py index cfdd3582ed2ef..957465a2b56dc 100644 --- a/tests/core/block/test_common.py +++ b/tests/core/block/test_common.py @@ -2,8 +2,13 @@ import pytest +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.core.block.common import RefCounter +if should_skip_test_group(group_name="TEST_CORE"): + pytest.skip("TEST_CORE=DISABLE, skipping core test group", + allow_module_level=True) + @pytest.mark.parametrize("seed", list(range(20))) @pytest.mark.parametrize("num_incrs", [1, 100]) diff --git a/tests/core/block/test_cpu_gpu_block_allocator.py b/tests/core/block/test_cpu_gpu_block_allocator.py index 44a5be6c181a0..a70310906e2f1 100644 --- a/tests/core/block/test_cpu_gpu_block_allocator.py +++ b/tests/core/block/test_cpu_gpu_block_allocator.py @@ -1,8 +1,13 @@ import pytest +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.core.block.cpu_gpu_block_allocator import CpuGpuBlockAllocator from vllm.utils import Device, chunk_list +if should_skip_test_group(group_name="TEST_CORE"): + pytest.skip("TEST_CORE=DISABLE, skipping core test group", + allow_module_level=True) + @pytest.mark.parametrize("num_cpu_blocks", [0, 512]) @pytest.mark.parametrize("num_gpu_blocks", [1024]) diff --git a/tests/core/block/test_naive_block.py b/tests/core/block/test_naive_block.py index edcdc0c7d4f98..4e619ee433f85 100644 --- a/tests/core/block/test_naive_block.py +++ b/tests/core/block/test_naive_block.py @@ -2,9 +2,14 @@ import pytest +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.core.block.interfaces import Block, BlockAllocator from vllm.core.block.naive_block import NaiveBlock, NaiveBlockAllocator +if should_skip_test_group(group_name="TEST_CORE"): + pytest.skip("TEST_CORE=DISABLE, skipping core test group", + allow_module_level=True) + class TestNaiveBlockAllocator: diff --git a/tests/core/block/test_prefix_caching_block.py b/tests/core/block/test_prefix_caching_block.py index bcf08cda09f46..c300345dd7da6 100644 --- a/tests/core/block/test_prefix_caching_block.py +++ b/tests/core/block/test_prefix_caching_block.py @@ -5,10 +5,15 @@ import pytest +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.core.block.interfaces import Block, BlockAllocator from vllm.core.block.prefix_caching_block import (PrefixCachingBlock, PrefixCachingBlockAllocator) +if should_skip_test_group(group_name="TEST_CORE"): + 
pytest.skip("TEST_CORE=DISABLE, skipping core test group", + allow_module_level=True) + class TestPrefixCachingBlock: diff --git a/tests/core/test_block_manager.py b/tests/core/test_block_manager.py index cd306b9e4d3cc..17c7f91f01eed 100644 --- a/tests/core/test_block_manager.py +++ b/tests/core/test_block_manager.py @@ -4,6 +4,7 @@ import pytest +from tests.nm_utils.utils_skip import should_skip_test_group from vllm import SamplingParams from vllm.block import PhysicalTokenBlock from vllm.core.block.utils import (STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE, @@ -16,6 +17,10 @@ from .utils import create_dummy_prompt, create_dummy_prompt_encoder_decoder +if should_skip_test_group(group_name="TEST_CORE"): + pytest.skip("TEST_CORE=DISABLE, skipping core test group", + allow_module_level=True) + def test_block_allocator_allocate(): block_size = 4 diff --git a/tests/core/test_chunked_prefill_scheduler.py b/tests/core/test_chunked_prefill_scheduler.py index 3649e6b003a5d..8fce7b4364298 100644 --- a/tests/core/test_chunked_prefill_scheduler.py +++ b/tests/core/test_chunked_prefill_scheduler.py @@ -3,6 +3,7 @@ import pytest # noqa +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.config import CacheConfig, SchedulerConfig from vllm.core.interfaces import AllocStatus from vllm.core.scheduler import Scheduler @@ -10,6 +11,10 @@ from .utils import create_dummy_prompt +if should_skip_test_group(group_name="TEST_CORE"): + pytest.skip("TEST_CORE=DISABLE, skipping core test group", + allow_module_level=True) + def get_sequence_groups(scheduler_output): return [s.seq_group for s in scheduler_output.scheduled_seq_groups] diff --git a/tests/core/test_scheduler.py b/tests/core/test_scheduler.py index 07fc8731e1847..b7960435a6d69 100644 --- a/tests/core/test_scheduler.py +++ b/tests/core/test_scheduler.py @@ -5,6 +5,7 @@ import pytest # noqa +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig from vllm.core.interfaces import AllocStatus from vllm.core.policy import PolicyFactory @@ -14,6 +15,10 @@ from .utils import create_dummy_prompt +if should_skip_test_group(group_name="TEST_CORE"): + pytest.skip("TEST_CORE=DISABLE, skipping core test group", + allow_module_level=True) + def get_sequence_groups(scheduler_output): return [s.seq_group for s in scheduler_output.scheduled_seq_groups] diff --git a/tests/distributed/test_basic_distributed_correctness.py b/tests/distributed/test_basic_distributed_correctness.py index b0576e20e9e1c..aeb4057ebe34e 100644 --- a/tests/distributed/test_basic_distributed_correctness.py +++ b/tests/distributed/test_basic_distributed_correctness.py @@ -21,6 +21,12 @@ import pytest import torch +from tests.nm_utils.utils_skip import should_skip_test_group + +if should_skip_test_group(group_name="TEST_DISTRIBUTED"): + pytest.skip("TEST_DISTRIBUTED=DISABLE, skipping distributed test group", + allow_module_level=True) + MODELS = [ "meta-llama/Llama-2-7b-hf", ] diff --git a/tests/distributed/test_chunked_prefill_distributed.py b/tests/distributed/test_chunked_prefill_distributed.py index 204e79e26d513..c8d1147238f52 100644 --- a/tests/distributed/test_chunked_prefill_distributed.py +++ b/tests/distributed/test_chunked_prefill_distributed.py @@ -20,6 +20,12 @@ import pytest import torch +from tests.nm_utils.utils_skip import should_skip_test_group + +if should_skip_test_group(group_name="TEST_DISTRIBUTED"): + pytest.skip("TEST_DISTRIBUTED=DISABLE, skipping distributed test group", + allow_module_level=True) 
+ MODELS = [ "meta-llama/Llama-2-7b-hf", ] diff --git a/tests/distributed/test_comm_ops.py b/tests/distributed/test_comm_ops.py index 2b597bb63c031..894938e874092 100644 --- a/tests/distributed/test_comm_ops.py +++ b/tests/distributed/test_comm_ops.py @@ -8,12 +8,17 @@ import ray import torch +from tests.nm_utils.utils_skip import should_skip_test_group from tests.utils import (init_test_distributed_environment, multi_process_tensor_parallel) from vllm.distributed import (broadcast_tensor_dict, tensor_model_parallel_all_gather, tensor_model_parallel_all_reduce) +if should_skip_test_group(group_name="TEST_DISTRIBUTED"): + pytest.skip("TEST_DISTRIBUTED=DISABLE, skipping distributed test group", + allow_module_level=True) + @ray.remote(num_gpus=1, max_calls=1) def all_reduce_test_worker(tp_size: int, pp_size: int, rank: int, diff --git a/tests/distributed/test_custom_all_reduce.py b/tests/distributed/test_custom_all_reduce.py index e4bfda8425344..5f77ed7539979 100644 --- a/tests/distributed/test_custom_all_reduce.py +++ b/tests/distributed/test_custom_all_reduce.py @@ -6,6 +6,7 @@ import torch import torch.distributed as dist +from tests.nm_utils.utils_skip import should_skip_test_group from tests.utils import (init_test_distributed_environment, multi_process_tensor_parallel) from vllm.distributed.communication_op import ( # noqa @@ -13,6 +14,10 @@ from vllm.distributed.parallel_state import (get_tensor_model_parallel_group, get_tp_ca_communicator) +if should_skip_test_group(group_name="TEST_DISTRIBUTED"): + pytest.skip("TEST_DISTRIBUTED=DISABLE, skipping distributed test group", + allow_module_level=True) + random.seed(42) test_sizes = [random.randint(1024, 2048 * 1024) for _ in range(8)] for i, v in enumerate(test_sizes): diff --git a/tests/distributed/test_pynccl.py b/tests/distributed/test_pynccl.py index 0218295a3e3f9..b5dca7f8a82fd 100644 --- a/tests/distributed/test_pynccl.py +++ b/tests/distributed/test_pynccl.py @@ -5,6 +5,7 @@ import torch import torch.distributed +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.distributed.communication_op import ( # noqa graph_capture, tensor_model_parallel_all_reduce) from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator @@ -13,6 +14,10 @@ init_distributed_environment) from vllm.utils import update_environment_variables +if should_skip_test_group(group_name="TEST_DISTRIBUTED"): + pytest.skip("TEST_DISTRIBUTED=DISABLE, skipping distributed test group", + allow_module_level=True) + def distributed_run(fn, world_size): number_of_processes = world_size diff --git a/tests/engine/output_processor/test_multi_step.py b/tests/engine/output_processor/test_multi_step.py index 4f32a622546f0..99189af1b0076 100644 --- a/tests/engine/output_processor/test_multi_step.py +++ b/tests/engine/output_processor/test_multi_step.py @@ -4,6 +4,7 @@ import pytest from transformers import PreTrainedTokenizer +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.core.scheduler import Scheduler from vllm.engine.output_processor.multi_step import MultiStepOutputProcessor from vllm.engine.output_processor.stop_checker import StopChecker @@ -15,6 +16,10 @@ from ...core.utils import create_seq_group +if should_skip_test_group(group_name="TEST_ENGINE"): + pytest.skip("TEST_ENGINE=DISABLE, skipping engine test group", + allow_module_level=True) + @pytest.mark.parametrize("seq_output_len", [128]) @pytest.mark.parametrize("num_new_tokens", [1, 12]) diff --git a/tests/engine/output_processor/test_stop_checker.py 
b/tests/engine/output_processor/test_stop_checker.py index f795403e3d8ad..e6af7a3257d44 100644 --- a/tests/engine/output_processor/test_stop_checker.py +++ b/tests/engine/output_processor/test_stop_checker.py @@ -3,10 +3,15 @@ import pytest from transformers import PreTrainedTokenizer +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.engine.output_processor.stop_checker import StopChecker from vllm.sampling_params import SamplingParams from vllm.sequence import Logprob, Sequence, SequenceStatus +if should_skip_test_group(group_name="TEST_ENGINE"): + pytest.skip("TEST_ENGINE=DISABLE, skipping engine test group", + allow_module_level=True) + def sequence_with_eos(text: str, eos_token: str, eos_token_id: int) -> Sequence: diff --git a/tests/engine/test_computed_prefix_blocks.py b/tests/engine/test_computed_prefix_blocks.py index ed35212cc3f11..1f2c7fd14c4cc 100644 --- a/tests/engine/test_computed_prefix_blocks.py +++ b/tests/engine/test_computed_prefix_blocks.py @@ -1,9 +1,14 @@ import pytest +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.engine.arg_utils import EngineArgs from vllm.engine.llm_engine import LLMEngine from vllm.sampling_params import SamplingParams +if should_skip_test_group(group_name="TEST_ENGINE"): + pytest.skip("TEST_ENGINE=DISABLE, skipping engine test group", + allow_module_level=True) + @pytest.mark.parametrize("model", ["facebook/opt-125m"]) @pytest.mark.parametrize("block_size", [16]) diff --git a/tests/engine/test_detokenization.py b/tests/engine/test_detokenization.py index f77f6d0725b6b..ae1d5779d72c1 100644 --- a/tests/engine/test_detokenization.py +++ b/tests/engine/test_detokenization.py @@ -1,8 +1,13 @@ import pytest +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.entrypoints.llm import LLM from vllm.sampling_params import SamplingParams +if should_skip_test_group(group_name="TEST_ENGINE"): + pytest.skip("TEST_ENGINE=DISABLE, skipping engine test group", + allow_module_level=True) + @pytest.mark.parametrize("model", ["facebook/opt-125m"]) def test_computed_prefix_blocks(model: str): diff --git a/tests/engine/test_multiproc_workers.py b/tests/engine/test_multiproc_workers.py index 610ad9732fb91..8b3f68c843cc1 100644 --- a/tests/engine/test_multiproc_workers.py +++ b/tests/engine/test_multiproc_workers.py @@ -6,9 +6,14 @@ import pytest +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.executor.multiproc_worker_utils import (ProcessWorkerWrapper, ResultHandler, WorkerMonitor) +if should_skip_test_group(group_name="TEST_ENGINE"): + pytest.skip("TEST_ENGINE=DISABLE, skipping engine test group", + allow_module_level=True) + class DummyWorker: """Dummy version of vllm.worker.worker.Worker""" diff --git a/tests/engine/test_skip_tokenizer_init.py b/tests/engine/test_skip_tokenizer_init.py index 338b208723ba9..438ae0fc71477 100644 --- a/tests/engine/test_skip_tokenizer_init.py +++ b/tests/engine/test_skip_tokenizer_init.py @@ -1,8 +1,13 @@ import pytest +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.entrypoints.llm import LLM from vllm.sampling_params import SamplingParams +if should_skip_test_group(group_name="TEST_ENGINE"): + pytest.skip("TEST_ENGINE=DISABLE, skipping engine test group", + allow_module_level=True) + @pytest.mark.parametrize("model", ["facebook/opt-125m"]) def test_skip_tokenizer_initialization(model: str): diff --git a/tests/engine/test_stop_reason.py b/tests/engine/test_stop_reason.py index b0bd6c4aa95d3..0a95f2cb59e4d 100644 --- 
a/tests/engine/test_stop_reason.py +++ b/tests/engine/test_stop_reason.py @@ -9,8 +9,13 @@ import pytest import transformers +from tests.nm_utils.utils_skip import should_skip_test_group from vllm import SamplingParams +if should_skip_test_group(group_name="TEST_ENGINE"): + pytest.skip("TEST_ENGINE=DISABLE, skipping engine test group", + allow_module_level=True) + MODEL = "facebook/opt-350m" STOP_STR = "." SEED = 42 diff --git a/tests/engine/test_stop_strings.py b/tests/engine/test_stop_strings.py index 1584b85aeb064..8a68dceac136c 100644 --- a/tests/engine/test_stop_strings.py +++ b/tests/engine/test_stop_strings.py @@ -2,8 +2,13 @@ import pytest +from tests.nm_utils.utils_skip import should_skip_test_group from vllm import CompletionOutput, LLMEngine, SamplingParams +if should_skip_test_group(group_name="TEST_ENGINE"): + pytest.skip("TEST_ENGINE=DISABLE, skipping engine test group", + allow_module_level=True) + MODEL = "meta-llama/llama-2-7b-hf" MAX_TOKENS = 200 diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py index c45f02fe564a3..e3168f67e001f 100644 --- a/tests/entrypoints/openai/test_serving_chat.py +++ b/tests/entrypoints/openai/test_serving_chat.py @@ -3,8 +3,13 @@ import pytest +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.entrypoints.openai.serving_chat import OpenAIServingChat +if should_skip_test_group(group_name="TEST_ENTRYPOINTS"): + pytest.skip("TEST_ENTRYPOINTS=DISABLE, skipping entrypoints group", + allow_module_level=True) + MODEL_NAME = "openai-community/gpt2" CHAT_TEMPLATE = "Dummy chat template for testing {}" diff --git a/tests/entrypoints/test_guided_processors.py b/tests/entrypoints/test_guided_processors.py index fb32a9d155bc0..fe5c709b1db21 100644 --- a/tests/entrypoints/test_guided_processors.py +++ b/tests/entrypoints/test_guided_processors.py @@ -4,12 +4,17 @@ import torch from transformers import AutoTokenizer +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.entrypoints.openai.protocol import CompletionRequest from vllm.model_executor.guided_decoding import ( get_guided_decoding_logits_processor) from vllm.model_executor.guided_decoding.outlines_logits_processors import ( JSONLogitsProcessor, RegexLogitsProcessor) +if should_skip_test_group(group_name="TEST_ENTRYPOINTS"): + pytest.skip("TEST_ENTRYPOINTS=DISABLE, skipping entrypoints group", + allow_module_level=True) + TEST_SCHEMA = { "type": "object", "properties": { diff --git a/tests/entrypoints/test_llm_encode.py b/tests/entrypoints/test_llm_encode.py index 7c3fbe43a8384..12a0a1a269ede 100644 --- a/tests/entrypoints/test_llm_encode.py +++ b/tests/entrypoints/test_llm_encode.py @@ -3,8 +3,13 @@ import pytest +from tests.nm_utils.utils_skip import should_skip_test_group from vllm import LLM, EmbeddingRequestOutput, PoolingParams +if should_skip_test_group(group_name="TEST_ENTRYPOINTS"): + pytest.skip("TEST_ENTRYPOINTS=DISABLE, skipping entrypoints group", + allow_module_level=True) + from ..conftest import cleanup MODEL_NAME = "intfloat/e5-mistral-7b-instruct" diff --git a/tests/entrypoints/test_llm_generate.py b/tests/entrypoints/test_llm_generate.py index a00fff91a310e..96b47fb5e170b 100644 --- a/tests/entrypoints/test_llm_generate.py +++ b/tests/entrypoints/test_llm_generate.py @@ -3,10 +3,15 @@ import pytest +from tests.nm_utils.utils_skip import should_skip_test_group from vllm import LLM, RequestOutput, SamplingParams from ..conftest import cleanup +if 
should_skip_test_group(group_name="TEST_ENTRYPOINTS"): + pytest.skip("TEST_ENTRYPOINTS=DISABLE, skipping entrypoints group", + allow_module_level=True) + MODEL_NAME = "facebook/opt-125m" PROMPTS = [ diff --git a/tests/entrypoints/test_openai_run_batch.py b/tests/entrypoints/test_openai_run_batch.py index 5de28513ca391..6ce7bc08b6cb2 100644 --- a/tests/entrypoints/test_openai_run_batch.py +++ b/tests/entrypoints/test_openai_run_batch.py @@ -2,8 +2,15 @@ import sys import tempfile +import pytest + +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.entrypoints.openai.protocol import BatchRequestOutput +if should_skip_test_group(group_name="TEST_ENTRYPOINTS"): + pytest.skip("TEST_ENTRYPOINTS=DISABLE, skipping entrypoints group", + allow_module_level=True) + # ruff: noqa: E501 INPUT_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}} {"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}""" diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py index ac895e04dc1f9..d23705e46be70 100644 --- a/tests/entrypoints/test_openai_server.py +++ b/tests/entrypoints/test_openai_server.py @@ -13,9 +13,14 @@ from huggingface_hub import snapshot_download from openai import BadRequestError +from tests.nm_utils.utils_skip import should_skip_test_group from tests.utils import ServerRunner from vllm.transformers_utils.tokenizer import get_tokenizer +if should_skip_test_group(group_name="TEST_ENTRYPOINTS"): + pytest.skip("TEST_ENTRYPOINTS=DISABLE, skipping entrypoints group", + allow_module_level=True) + # any model with a chat template should work here MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" EMBEDDING_MODEL_NAME = "intfloat/e5-mistral-7b-instruct" diff --git a/tests/entrypoints/test_server_oot_registration.py b/tests/entrypoints/test_server_oot_registration.py index 3e55d7f4297fb..394594fcbf085 100644 --- a/tests/entrypoints/test_server_oot_registration.py +++ b/tests/entrypoints/test_server_oot_registration.py @@ -5,11 +5,16 @@ import torch from openai import OpenAI, OpenAIError +from tests.nm_utils.utils_skip import should_skip_test_group from vllm import ModelRegistry from vllm.model_executor.models.opt import OPTForCausalLM from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.utils import get_open_port +if should_skip_test_group(group_name="TEST_ENTRYPOINTS"): + pytest.skip("TEST_ENTRYPOINTS=DISABLE, skipping entrypoints group", + allow_module_level=True) + pytestmark = pytest.mark.openai diff --git a/tests/kernels/test_activation.py b/tests/kernels/test_activation.py index a4b9f91c7688b..320f6e76bb0b8 100644 --- a/tests/kernels/test_activation.py +++ b/tests/kernels/test_activation.py @@ -3,11 +3,16 @@ import pytest import torch +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.model_executor.layers.activation import (FastGELU, GeluAndMul, NewGELU, SiluAndMul) from .allclose_default import get_default_atol, get_default_rtol +if should_skip_test_group(group_name="TEST_KERNELS"): + pytest.skip("TEST_KERNELS=DISABLE, skipping kernels test group", + 
allow_module_level=True) + DTYPES = [torch.half, torch.bfloat16, torch.float] NUM_TOKENS = [7, 83, 2048] # Arbitrary values for testing D = [512, 4096, 5120, 13824] # Arbitrary values for testing diff --git a/tests/kernels/test_attention.py b/tests/kernels/test_attention.py index fa5c951a7fa7a..458226ce38ccd 100644 --- a/tests/kernels/test_attention.py +++ b/tests/kernels/test_attention.py @@ -6,11 +6,16 @@ from xformers import ops as xops from xformers.ops.fmha.attn_bias import BlockDiagonalCausalMask +from tests.nm_utils.utils_skip import should_skip_test_group from vllm import _custom_ops as ops from vllm.utils import get_max_shared_memory_bytes, is_hip from .allclose_default import get_default_atol, get_default_rtol +if should_skip_test_group(group_name="TEST_KERNELS"): + pytest.skip("TEST_KERNELS=DISABLE, skipping kernels test group", + allow_module_level=True) + FLOAT32_BYTES = torch.finfo(torch.float).bits // 8 # This will change depending on the compute capability. # - 512 as a buffer diff --git a/tests/kernels/test_attention_selector.py b/tests/kernels/test_attention_selector.py index 79e03c7478de0..79358a2c47c68 100644 --- a/tests/kernels/test_attention_selector.py +++ b/tests/kernels/test_attention_selector.py @@ -5,8 +5,13 @@ from tests.kernels.utils import (STR_FLASH_ATTN_VAL, STR_INVALID_VAL, override_backend_env_variable) +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.attention.selector import which_attn_to_use +if should_skip_test_group(group_name="TEST_KERNELS"): + pytest.skip("TEST_KERNELS=DISABLE, skipping kernels test group", + allow_module_level=True) + @pytest.mark.parametrize( "name", ["TORCH_SDPA", "ROCM_FLASH", "XFORMERS", "FLASHINFER"]) diff --git a/tests/kernels/test_blocksparse_attention.py b/tests/kernels/test_blocksparse_attention.py index 12109f8767782..8a4b7e62a053b 100644 --- a/tests/kernels/test_blocksparse_attention.py +++ b/tests/kernels/test_blocksparse_attention.py @@ -4,6 +4,7 @@ import pytest import torch +from tests.nm_utils.utils_skip import should_skip_test_group from vllm import _custom_ops as ops from vllm.attention.ops.blocksparse_attention.interface import ( LocalStridedBlockSparseAttn) @@ -11,6 +12,10 @@ from .allclose_default import get_default_atol, get_default_rtol +if should_skip_test_group(group_name="TEST_KERNELS"): + pytest.skip("TEST_KERNELS=DISABLE, skipping kernels test group", + allow_module_level=True) + FLOAT32_BYTES = torch.finfo(torch.float).bits // 8 # This will change depending on the compute capability. 
# - 512 as a buffer diff --git a/tests/kernels/test_cache.py b/tests/kernels/test_cache.py index f26eb896105f6..f7aec1cb5b677 100644 --- a/tests/kernels/test_cache.py +++ b/tests/kernels/test_cache.py @@ -4,8 +4,13 @@ import pytest import torch +from tests.nm_utils.utils_skip import should_skip_test_group from vllm import _custom_ops as ops +if should_skip_test_group(group_name="TEST_KERNELS"): + pytest.skip("TEST_KERNELS=DISABLE, skipping kernels test group", + allow_module_level=True) + COPYING_DIRECTION = [('cuda', 'cpu'), ('cuda', 'cuda'), ('cpu', 'cuda')] DTYPES = [torch.half, torch.bfloat16, torch.float] NUM_TOKENS = [42] # Arbitrary values for testing diff --git a/tests/kernels/test_cutlass.py b/tests/kernels/test_cutlass.py index 276ecf00246c7..a9aeeb3a78bf5 100644 --- a/tests/kernels/test_cutlass.py +++ b/tests/kernels/test_cutlass.py @@ -7,8 +7,13 @@ import pytest import torch +from tests.nm_utils.utils_skip import should_skip_test_group from vllm import _custom_ops as ops +if should_skip_test_group(group_name="TEST_KERNELS"): + pytest.skip("TEST_KERNELS=DISABLE, skipping kernels test group", + allow_module_level=True) + CUDA_DEVICES = [ f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) ] diff --git a/tests/kernels/test_flash_attn.py b/tests/kernels/test_flash_attn.py index 22772d4ea4422..4437a5ddc8d7a 100644 --- a/tests/kernels/test_flash_attn.py +++ b/tests/kernels/test_flash_attn.py @@ -4,6 +4,12 @@ import torch from vllm_flash_attn import flash_attn_varlen_func, flash_attn_with_kvcache +from tests.nm_utils.utils_skip import should_skip_test_group + +if should_skip_test_group(group_name="TEST_KERNELS"): + pytest.skip("TEST_KERNELS=DISABLE, skipping kernels test group", + allow_module_level=True) + NUM_HEADS = [(16, 16), (32, 8), (64, 8)] HEAD_SIZES = [128, 256] BLOCK_SIZES = [16, 32] diff --git a/tests/kernels/test_int8_quant.py b/tests/kernels/test_int8_quant.py index 0daf7439468aa..679b21fc74606 100644 --- a/tests/kernels/test_int8_quant.py +++ b/tests/kernels/test_int8_quant.py @@ -3,6 +3,11 @@ # ruff: noqa: F401 import vllm._C +from tests.nm_utils.utils_skip import should_skip_test_group + +if should_skip_test_group(group_name="TEST_KERNELS"): + pytest.skip("TEST_KERNELS=DISABLE, skipping kernels test group", + allow_module_level=True) DTYPES = [torch.half, torch.bfloat16, torch.float] HIDDEN_SIZES = [16, 67, 768, 2048, 5120, 5137, 8192, diff --git a/tests/kernels/test_layernorm.py b/tests/kernels/test_layernorm.py index a635e6c12c594..13e4b4febe54b 100644 --- a/tests/kernels/test_layernorm.py +++ b/tests/kernels/test_layernorm.py @@ -1,8 +1,13 @@ import pytest import torch +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.model_executor.layers.layernorm import RMSNorm +if should_skip_test_group(group_name="TEST_KERNELS"): + pytest.skip("TEST_KERNELS=DISABLE, skipping kernels test group", + allow_module_level=True) + DTYPES = [torch.half, torch.bfloat16, torch.float] NUM_TOKENS = [7, 83, 4096] # Arbitrary values for testing HIDDEN_SIZES = [768, 769, 770, 771, 5120, 5124, 5125, 5126, 8192, diff --git a/tests/kernels/test_marlin_gemm.py b/tests/kernels/test_marlin_gemm.py index 1f8d94bad26d9..1e59513a1ae48 100644 --- a/tests/kernels/test_marlin_gemm.py +++ b/tests/kernels/test_marlin_gemm.py @@ -5,6 +5,7 @@ import pytest import torch +from tests.nm_utils.utils_skip import should_skip_test_group from vllm import _custom_ops as ops from vllm.model_executor.layers.quantization.gptq_marlin import ( GPTQ_MARLIN_MAX_PARALLEL, 
GPTQ_MARLIN_MIN_THREAD_N, @@ -20,6 +21,10 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import ( gptq_pack, quantize_weights, sort_weights) +if should_skip_test_group(group_name="TEST_KERNELS"): + pytest.skip("TEST_KERNELS=DISABLE, skipping kernels test group", + allow_module_level=True) + ACT_ORDER_OPTS = [False, True] K_FULL_OPTS = [False, True] diff --git a/tests/kernels/test_moe.py b/tests/kernels/test_moe.py index d08410a8bdf65..7fad6e53ee754 100644 --- a/tests/kernels/test_moe.py +++ b/tests/kernels/test_moe.py @@ -7,10 +7,15 @@ from transformers import MixtralConfig from transformers.models.mixtral.modeling_mixtral import MixtralSparseMoeBlock +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.fused_moe import fused_moe from vllm.model_executor.models.mixtral import MixtralMoE +if should_skip_test_group(group_name="TEST_KERNELS"): + pytest.skip("TEST_KERNELS=DISABLE, skipping kernels test group", + allow_module_level=True) + def torch_moe(a, w1, w2, score, topk): B, D = a.shape diff --git a/tests/kernels/test_pos_encoding.py b/tests/kernels/test_pos_encoding.py index e564e325112a6..2934959f3d37e 100644 --- a/tests/kernels/test_pos_encoding.py +++ b/tests/kernels/test_pos_encoding.py @@ -4,10 +4,15 @@ import pytest import torch +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.model_executor.layers.rotary_embedding import get_rope from .allclose_default import get_default_atol, get_default_rtol +if should_skip_test_group(group_name="TEST_KERNELS"): + pytest.skip("TEST_KERNELS=DISABLE, skipping kernels test group", + allow_module_level=True) + IS_NEOX_STYLE = [True, False] DTYPES = [torch.half, torch.bfloat16, torch.float] HEAD_SIZES = [64, 80, 96, 112, 128, 192, 256] diff --git a/tests/kernels/test_prefix_prefill.py b/tests/kernels/test_prefix_prefill.py index 3534468355d9d..630cf77cd4b51 100644 --- a/tests/kernels/test_prefix_prefill.py +++ b/tests/kernels/test_prefix_prefill.py @@ -7,9 +7,14 @@ from xformers import ops as xops from xformers.ops.fmha.attn_bias import BlockDiagonalCausalFromBottomRightMask +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.attention.backends.xformers import _make_alibi_bias from vllm.attention.ops.prefix_prefill import context_attention_fwd +if should_skip_test_group(group_name="TEST_KERNELS"): + pytest.skip("TEST_KERNELS=DISABLE, skipping kernels test group", + allow_module_level=True) + NUM_HEADS = [64] NUM_QUERIES_PER_KV = [1, 8, 64] HEAD_SIZES = [128, 96, 24] diff --git a/tests/kernels/test_rand.py b/tests/kernels/test_rand.py index 1e38253937ed5..737467e5f6252 100644 --- a/tests/kernels/test_rand.py +++ b/tests/kernels/test_rand.py @@ -3,9 +3,14 @@ import pytest import torch +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.model_executor.layers.ops.rand import seeded_uniform from vllm.model_executor.utils import set_random_seed +if should_skip_test_group(group_name="TEST_KERNELS"): + pytest.skip("TEST_KERNELS=DISABLE, skipping kernels test group", + allow_module_level=True) + @pytest.mark.skip("C compiler not installed in NM automation. 
" "This codepath follows a triton pathway, which " diff --git a/tests/kernels/test_sampler.py b/tests/kernels/test_sampler.py index feb915932bba5..951590a2278e6 100644 --- a/tests/kernels/test_sampler.py +++ b/tests/kernels/test_sampler.py @@ -5,12 +5,17 @@ import triton import triton.language as tl +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.model_executor.layers.ops.sample import ( MAX_TRITON_N_COLS, _uniform_to_exponential, get_num_triton_sampler_splits, sample) from vllm.model_executor.sampling_metadata import SamplingTensors from vllm.model_executor.utils import set_random_seed +if should_skip_test_group(group_name="TEST_KERNELS"): + pytest.skip("TEST_KERNELS=DISABLE, skipping kernels test group", + allow_module_level=True) + SINGLE_SPLIT_VOCAB_SIZE = 32000 # llama/mistral/mixtral vocab size MULTI_SPLIT_VOCAB_SIZE = MAX_TRITON_N_COLS + 100 diff --git a/tests/lora/test_baichuan.py b/tests/lora/test_baichuan.py index 5ab863eea94b3..825f26ad28892 100644 --- a/tests/lora/test_baichuan.py +++ b/tests/lora/test_baichuan.py @@ -1,10 +1,15 @@ import pytest import vllm +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.lora.request import LoRARequest from .conftest import cleanup +if should_skip_test_group(group_name="TEST_LORA"): + pytest.skip("TEST_LORA=DISABLE, skipping lora test group", + allow_module_level=True) + MODEL_PATH = "baichuan-inc/Baichuan-7B" PROMPT_TEMPLATE = """I want you to act as a SQL terminal in front of an example database, you need only to return the sql command to me.Below is an instruction that describes a task, Write a response that appropriately completes the request.\n"\n##Instruction:\nconcert_singer contains tables such as stadium, singer, concert, singer_in_concert. Table stadium has columns such as Stadium_ID, Location, Name, Capacity, Highest, Lowest, Average. Stadium_ID is the primary key.\nTable singer has columns such as Singer_ID, Name, Country, Song_Name, Song_release_year, Age, Is_male. Singer_ID is the primary key.\nTable concert has columns such as concert_ID, concert_Name, Theme, Stadium_ID, Year. concert_ID is the primary key.\nTable singer_in_concert has columns such as concert_ID, Singer_ID. 
concert_ID is the primary key.\nThe Stadium_ID of concert is the foreign key of Stadium_ID of stadium.\nThe Singer_ID of singer_in_concert is the foreign key of Singer_ID of singer.\nThe concert_ID of singer_in_concert is the foreign key of concert_ID of concert.\n\n###Input:\n{query}\n\n###Response:""" # noqa: E501 @@ -105,4 +110,4 @@ def test_baichuan_tensor_parallel_equality(baichuan_lora_files): del llm_tp4 cleanup() - assert output_tp1 == output_tp4 \ No newline at end of file + assert output_tp1 == output_tp4 diff --git a/tests/lora/test_chatglm3.py b/tests/lora/test_chatglm3.py index bd8cc98ef8ca0..9cee24c90f972 100644 --- a/tests/lora/test_chatglm3.py +++ b/tests/lora/test_chatglm3.py @@ -1,6 +1,13 @@ +import pytest + import vllm +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.lora.request import LoRARequest +if should_skip_test_group(group_name="TEST_LORA"): + pytest.skip("TEST_LORA=DISABLE, skipping lora test group", + allow_module_level=True) + MODEL_PATH = "THUDM/chatglm3-6b" PROMPT_TEMPLATE = """I want you to act as a SQL terminal in front of an example database, you need only to return the sql command to me.Below is an instruction that describes a task, Write a response that appropriately completes the request.\n"\n##Instruction:\nconcert_singer contains tables such as stadium, singer, concert, singer_in_concert. Table stadium has columns such as Stadium_ID, Location, Name, Capacity, Highest, Lowest, Average. Stadium_ID is the primary key.\nTable singer has columns such as Singer_ID, Name, Country, Song_Name, Song_release_year, Age, Is_male. Singer_ID is the primary key.\nTable concert has columns such as concert_ID, concert_Name, Theme, Stadium_ID, Year. concert_ID is the primary key.\nTable singer_in_concert has columns such as concert_ID, Singer_ID. concert_ID is the primary key.\nThe Stadium_ID of concert is the foreign key of Stadium_ID of stadium.\nThe Singer_ID of singer_in_concert is the foreign key of Singer_ID of singer.\nThe concert_ID of singer_in_concert is the foreign key of concert_ID of concert.\n\n###Input:\n{query}\n\n###Response:""" # noqa: E501 diff --git a/tests/lora/test_gemma.py b/tests/lora/test_gemma.py index bf96de026ae09..0c31726dc0fd0 100644 --- a/tests/lora/test_gemma.py +++ b/tests/lora/test_gemma.py @@ -1,8 +1,13 @@ import pytest import vllm +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.lora.request import LoRARequest +if should_skip_test_group(group_name="TEST_LORA"): + pytest.skip("TEST_LORA=DISABLE, skipping lora test group", + allow_module_level=True) + MODEL_PATH = "google/gemma-7b" diff --git a/tests/lora/test_layer_variation.py b/tests/lora/test_layer_variation.py index ace10e389ae6a..712f822d9bed9 100644 --- a/tests/lora/test_layer_variation.py +++ b/tests/lora/test_layer_variation.py @@ -7,10 +7,15 @@ from transformers import AutoModelForCausalLM import vllm +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.lora.request import LoRARequest from .conftest import cleanup +if should_skip_test_group(group_name="TEST_LORA"): + pytest.skip("TEST_LORA=DISABLE, skipping lora test group", + allow_module_level=True) + MODEL_PATH = "Felladrin/Llama-68M-Chat-v1" PROMPTS = [ "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. 
This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nSpellForce 3 is a pretty bad game. The developer Grimlore Games is clearly a bunch of no-talent hacks, and 2017 was a terrible year for games anyway. [/user] [assistant]", # noqa: E501 diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py index 63fd2cd9e7fbb..a3d9203093fef 100644 --- a/tests/lora/test_layers.py +++ b/tests/lora/test_layers.py @@ -8,6 +8,7 @@ import torch import torch.nn.functional as F +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.config import LoRAConfig from vllm.lora.fully_sharded_layers import ( ColumnParallelLinearWithShardedLoRA, @@ -38,6 +39,10 @@ from .utils import DummyLoRAManager +if should_skip_test_group(group_name="TEST_LORA"): + pytest.skip("TEST_LORA=DISABLE, skipping lora test group", + allow_module_level=True) + TOLERANCES = { torch.float16: (5e-3, 5e-3), torch.float32: (5e-3, 5e-3), diff --git a/tests/lora/test_llama.py b/tests/lora/test_llama.py index 7143a99bea081..ff1d82ba7104f 100644 --- a/tests/lora/test_llama.py +++ b/tests/lora/test_llama.py @@ -2,10 +2,15 @@ import ray import vllm +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.lora.request import LoRARequest from .conftest import cleanup +if should_skip_test_group(group_name="TEST_LORA"): + pytest.skip("TEST_LORA=DISABLE, skipping lora test group", + allow_module_level=True) + MODEL_PATH = "meta-llama/Llama-2-7b-hf" diff --git a/tests/lora/test_long_context.py b/tests/lora/test_long_context.py index feb58aa28bda4..793e34bf27e19 100644 --- a/tests/lora/test_long_context.py +++ b/tests/lora/test_long_context.py @@ -5,6 +5,7 @@ import pytest import vllm +from tests.nm_utils.utils_skip import should_skip_test_group from vllm import SamplingParams from vllm.lora.layers import LinearScalingRotaryEmbeddingWithLora from vllm.lora.request import LoRARequest @@ -13,6 +14,10 @@ from .data.long_context_test_data import prompts_and_responses +if should_skip_test_group(group_name="TEST_LORA"): + pytest.skip("TEST_LORA=DISABLE, skipping lora test group", + allow_module_level=True) + context_len_to_scaling_factor = { "16k": 4, "32k": 8, diff --git a/tests/lora/test_lora.py b/tests/lora/test_lora.py index 3415d36b7e341..34c6941140754 100644 --- a/tests/lora/test_lora.py +++ b/tests/lora/test_lora.py @@ -1,10 +1,15 @@ import pytest import torch +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.lora.layers import _apply_lora, _apply_lora_packed_nslice from .utils import DummyLoRAManager +if should_skip_test_group(group_name="TEST_LORA"): + pytest.skip("TEST_LORA=DISABLE, skipping lora test group", + allow_module_level=True) + TENSOR_SIZES = [128, 1024, 2048, 4096, 8192, 11008, 11008 // 2, 11008 // 4] QKV_TENSOR_SIZES = [ (8192, 1024, 1024), diff --git a/tests/lora/test_lora_checkpoints.py b/tests/lora/test_lora_checkpoints.py index d4d1665b624ea..9c9a0fea5cb6c 100644 --- a/tests/lora/test_lora_checkpoints.py +++ b/tests/lora/test_lora_checkpoints.py @@ -1,8 +1,13 @@ import pytest 
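Every test module touched by this patch gains the same module-level guard; for readability, here is the pattern once in full as a self-contained sketch (shown for the TEST_LORA group, matching the surrounding hunks):

import pytest

from tests.nm_utils.utils_skip import should_skip_test_group

# Raised at collection time, so the module is reported as skipped
# instead of failing on import when its group is disabled.
if should_skip_test_group(group_name="TEST_LORA"):
    pytest.skip("TEST_LORA=DISABLE, skipping lora test group",
                allow_module_level=True)

Because allow_module_level=True stops collection of the module, pytest records the module as skipped rather than erroring at import.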
+from tests.nm_utils.utils_skip import should_skip_test_group from vllm.lora.models import LoRAModel from vllm.model_executor.models.baichuan import BaiChuanBaseForCausalLM +if should_skip_test_group(group_name="TEST_LORA"): + pytest.skip("TEST_LORA=DISABLE, skipping lora test group", + allow_module_level=True) + lora_lst = ["baichuan7B", "baichuan7B-zero", "chatglm3-6b"] diff --git a/tests/lora/test_lora_manager.py b/tests/lora/test_lora_manager.py index c08eee9910149..09a27c90f4768 100644 --- a/tests/lora/test_lora_manager.py +++ b/tests/lora/test_lora_manager.py @@ -6,6 +6,7 @@ from safetensors.torch import load_file from torch import nn +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.config import LoRAConfig from vllm.lora.layers import (ColumnParallelLinearWithLoRA, MergedColumnParallelLinearWithLoRA, @@ -18,6 +19,10 @@ WorkerLoRAManager) from vllm.model_executor.layers.linear import RowParallelLinear +if should_skip_test_group(group_name="TEST_LORA"): + pytest.skip("TEST_LORA=DISABLE, skipping lora test group", + allow_module_level=True) + EMBEDDING_MODULES = { "embed_tokens": "input_embeddings", "lm_head": "output_embeddings", diff --git a/tests/lora/test_mixtral.py b/tests/lora/test_mixtral.py index 53d49a8dbc813..f7541f271fd98 100644 --- a/tests/lora/test_mixtral.py +++ b/tests/lora/test_mixtral.py @@ -2,8 +2,13 @@ import torch import vllm +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.lora.request import LoRARequest +if should_skip_test_group(group_name="TEST_LORA"): + pytest.skip("TEST_LORA=DISABLE, skipping lora test group", + allow_module_level=True) + MODEL_PATH = "mistralai/Mixtral-8x7B-Instruct-v0.1" diff --git a/tests/lora/test_phi.py b/tests/lora/test_phi.py index a2b42ce4cb96f..13636b9be5140 100644 --- a/tests/lora/test_phi.py +++ b/tests/lora/test_phi.py @@ -1,6 +1,13 @@ +import pytest + import vllm +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.lora.request import LoRARequest +if should_skip_test_group(group_name="TEST_LORA"): + pytest.skip("TEST_LORA=DISABLE, skipping lora test group", + allow_module_level=True) + MODEL_PATH = "microsoft/phi-2" PROMPT_TEMPLATE = "### Instruct: {sql_prompt}\n\n### Context: {context}\n\n### Output:" # noqa: E501 diff --git a/tests/lora/test_punica.py b/tests/lora/test_punica.py index f021c003b1322..29b4f9c411e1d 100644 --- a/tests/lora/test_punica.py +++ b/tests/lora/test_punica.py @@ -4,6 +4,11 @@ import torch import vllm.lora.punica as punica +from tests.nm_utils.utils_skip import should_skip_test_group + +if should_skip_test_group(group_name="TEST_LORA"): + pytest.skip("TEST_LORA=DISABLE, skipping lora test group", + allow_module_level=True) def assert_close(a, b): diff --git a/tests/lora/test_quant_model.py b/tests/lora/test_quant_model.py index feda06b256e04..278acd2dcdb89 100644 --- a/tests/lora/test_quant_model.py +++ b/tests/lora/test_quant_model.py @@ -6,10 +6,15 @@ import pytest import vllm +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.lora.request import LoRARequest from .conftest import cleanup +if should_skip_test_group(group_name="TEST_LORA"): + pytest.skip("TEST_LORA=DISABLE, skipping lora test group", + allow_module_level=True) + @dataclass class ModelWithQuantization: diff --git a/tests/lora/test_tokenizer_group.py b/tests/lora/test_tokenizer_group.py index 2dcad23c2b547..ce72a63016732 100644 --- a/tests/lora/test_tokenizer_group.py +++ b/tests/lora/test_tokenizer_group.py @@ -1,12 +1,17 @@ import pytest from transformers 
import AutoTokenizer, PreTrainedTokenizerBase +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.lora.request import LoRARequest from vllm.transformers_utils.tokenizer import get_lora_tokenizer from vllm.transformers_utils.tokenizer_group import get_tokenizer_group from ..conftest import get_tokenizer_pool_config +if should_skip_test_group(group_name="TEST_LORA"): + pytest.skip("TEST_LORA=DISABLE, skipping lora test group", + allow_module_level=True) + @pytest.mark.asyncio @pytest.mark.parametrize("tokenizer_group_type", [None, "ray"]) diff --git a/tests/lora/test_utils.py b/tests/lora/test_utils.py index 4ff9715b4ca8d..39fefc33c4c03 100644 --- a/tests/lora/test_utils.py +++ b/tests/lora/test_utils.py @@ -3,9 +3,14 @@ import pytest from torch import nn +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.lora.utils import parse_fine_tuned_lora_name, replace_submodule from vllm.utils import LRUCache +if should_skip_test_group(group_name="TEST_LORA"): + pytest.skip("TEST_LORA=DISABLE, skipping lora test group", + allow_module_level=True) + def test_parse_fine_tuned_lora_name_valid(): fixture = { diff --git a/tests/lora/test_worker.py b/tests/lora/test_worker.py index 732e91a52c0a9..8c45e15b50d0c 100644 --- a/tests/lora/test_worker.py +++ b/tests/lora/test_worker.py @@ -3,12 +3,19 @@ import tempfile from unittest.mock import patch +import pytest + +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, ModelConfig, ParallelConfig, SchedulerConfig) from vllm.lora.models import LoRAMapping from vllm.lora.request import LoRARequest from vllm.worker.worker import Worker +if should_skip_test_group(group_name="TEST_LORA"): + pytest.skip("TEST_LORA=DISABLE, skipping lora test group", + allow_module_level=True) + @patch.dict(os.environ, {"RANK": "0"}) def test_worker_apply_lora(sql_lora_files): diff --git a/tests/metrics/test_metrics.py b/tests/metrics/test_metrics.py index c1164739eee31..e953f7226b5af 100644 --- a/tests/metrics/test_metrics.py +++ b/tests/metrics/test_metrics.py @@ -3,11 +3,16 @@ import pytest from prometheus_client import REGISTRY +from tests.nm_utils.utils_skip import should_skip_test_group from vllm import EngineArgs, LLMEngine from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.sampling_params import SamplingParams +if should_skip_test_group(group_name="TEST_METRICS"): + pytest.skip("TEST_METRICS=DISABLE, skipping metrics test group", + allow_module_level=True) + MODELS = [ "facebook/opt-125m", ] diff --git a/tests/model_executor/weight_utils.py b/tests/model_executor/weight_utils.py index c8b9bed691bba..4bd0afdc8ca68 100644 --- a/tests/model_executor/weight_utils.py +++ b/tests/model_executor/weight_utils.py @@ -5,9 +5,15 @@ import pytest from huggingface_hub.utils import LocalEntryNotFoundError +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.model_executor.model_loader.weight_utils import ( download_weights_from_hf, enable_hf_transfer) +if should_skip_test_group(group_name="TEST_MODEL_EXECUTOR"): + pytest.skip( + "TEST_MODEL_EXECUTOR=DISABLE, skipping model executor test group", + allow_module_level=True) + def test_hf_transfer_auto_activation(): if "HF_HUB_ENABLE_HF_TRANSFER" in os.environ: diff --git a/tests/models/compare_utils.py b/tests/models/compare_utils.py index 051cbf1547b21..337428c6c6535 100644 --- a/tests/models/compare_utils.py +++ 
b/tests/models/compare_utils.py @@ -34,4 +34,5 @@ def check_logprobs_close(outputs_0_lst, outputs_1_lst, name_0, name_1): # Break out since sequences will now diverge. # as long as we got this far with the output tokens being the # same, or close, the responses are close enough + print(f"INFO: BREAK IN CHECK LOGPROBS CLOSE AT IDX: {idx}\n\n") break diff --git a/tests/models/test_aqlm.py b/tests/models/test_aqlm.py index c4ecf846e633c..e24f2632a28d0 100644 --- a/tests/models/test_aqlm.py +++ b/tests/models/test_aqlm.py @@ -6,8 +6,13 @@ import pytest import torch +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS +if should_skip_test_group(group_name="TEST_MODELS"): + pytest.skip("TEST_MODELS=DISABLE, skipping model test group", + allow_module_level=True) + aqlm_not_supported = True if torch.cuda.is_available(): diff --git a/tests/models/test_big_models.py b/tests/models/test_big_models.py index 48b655e58d602..962af078618fa 100644 --- a/tests/models/test_big_models.py +++ b/tests/models/test_big_models.py @@ -10,6 +10,12 @@ import pytest import torch +from tests.nm_utils.utils_skip import should_skip_test_group + +if should_skip_test_group(group_name="TEST_MODELS"): + pytest.skip("TEST_MODELS=DISABLE, skipping model test group", + allow_module_level=True) + MODELS = [ "meta-llama/Llama-2-7b-hf", "mistralai/Mistral-7B-v0.1", diff --git a/tests/models/test_compressed.py b/tests/models/test_compressed.py deleted file mode 100644 index 7bd9ae9409847..0000000000000 --- a/tests/models/test_compressed.py +++ /dev/null @@ -1,61 +0,0 @@ -"""Compare the outputs of a sparse model vs sparse model running dense. -Note: sparse kernels do not have bitwise correctness vs the dense models. -As a result, in this test, we just confirm that the top selected tokens of the -sparse models are in the top N selections of same model running dense. - -Run `pytest tests/models/test_compressed.py`. 
-""" - -import gc - -import pytest - -from tests.models.utils import check_logprobs_close - -MAX_MODEL_LEN = 1024 -MODEL_FORMAT_PAIRS = [ - ("nm-testing/TinyLlama-1.1B-Chat-v1.0-pruned2.4", - "semi_structured_sparse_w16a16"), - ("nm-testing/OpenHermes-2.5-Mistral-7B-pruned50", "sparse_w16a16"), -] - - -@pytest.mark.parametrize("model_format_pairs", MODEL_FORMAT_PAIRS) -@pytest.mark.parametrize("dtype", ["half"]) -@pytest.mark.parametrize("max_tokens", [32]) -@pytest.mark.parametrize("num_logprobs", [5]) -def test_models( - vllm_runner, - example_prompts, - model_format_pairs, - dtype: str, - max_tokens: int, - num_logprobs: int, -) -> None: - model_name, sparsity = model_format_pairs - - sparse_model = vllm_runner(model_name=model_name, - sparsity=sparsity, - dtype=dtype, - max_model_len=MAX_MODEL_LEN) - sparse_outputs = sparse_model.generate_greedy_logprobs( - example_prompts, max_tokens, num_logprobs) - del sparse_model - - gc.collect() - - dense_model = vllm_runner(model_name=model_name, - sparsity=None, - dtype=dtype, - max_model_len=MAX_MODEL_LEN) - dense_outputs = dense_model.generate_greedy_logprobs( - example_prompts, max_tokens, num_logprobs) - del dense_model - - # loop through the prompts - check_logprobs_close( - outputs_0_lst=dense_outputs, - outputs_1_lst=sparse_outputs, - name_0="dense", - name_1="sparse", - ) diff --git a/tests/models/test_embedding.py b/tests/models/test_embedding.py index 6556998b68a74..a7e4fc102cdb9 100644 --- a/tests/models/test_embedding.py +++ b/tests/models/test_embedding.py @@ -6,6 +6,12 @@ import torch import torch.nn.functional as F +from tests.nm_utils.utils_skip import should_skip_test_group + +if should_skip_test_group(group_name="TEST_MODELS"): + pytest.skip("TEST_MODELS=DISABLE, skipping model test group", + allow_module_level=True) + MODELS = [ "intfloat/e5-mistral-7b-instruct", ] diff --git a/tests/models/test_fp8.py b/tests/models/test_fp8.py index 61aee0d0a6e93..c9daed58db6ae 100644 --- a/tests/models/test_fp8.py +++ b/tests/models/test_fp8.py @@ -8,9 +8,14 @@ import torch from transformers import AutoTokenizer +from tests.nm_utils.utils_skip import should_skip_test_group from vllm import LLM, SamplingParams from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS +if should_skip_test_group(group_name="TEST_MODELS"): + pytest.skip("TEST_MODELS=DISABLE, skipping model test group", + allow_module_level=True) + os.environ["TOKENIZERS_PARALLELISM"] = "true" MAX_MODEL_LEN = 1024 diff --git a/tests/models/test_gptq_marlin.py b/tests/models/test_gptq_marlin.py index 1ecd27c5ce51e..978704cdc909c 100644 --- a/tests/models/test_gptq_marlin.py +++ b/tests/models/test_gptq_marlin.py @@ -14,9 +14,14 @@ import torch from tests.models.utils import check_logprobs_close +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS from vllm.model_executor.layers.rotary_embedding import _ROPE_DICT +if should_skip_test_group(group_name="TEST_MODELS"): + pytest.skip("TEST_MODELS=DISABLE, skipping model test group", + allow_module_level=True) + os.environ["TOKENIZERS_PARALLELISM"] = "true" MAX_MODEL_LEN = 1024 diff --git a/tests/models/test_gptq_marlin_24.py b/tests/models/test_gptq_marlin_24.py index 195c3e5b5863e..b4159ff0a4968 100644 --- a/tests/models/test_gptq_marlin_24.py +++ b/tests/models/test_gptq_marlin_24.py @@ -12,8 +12,13 @@ import torch from tests.models.utils import check_logprobs_close +from tests.nm_utils.utils_skip import should_skip_test_group from 
vllm.model_executor.layers.quantization import QUANTIZATION_METHODS +if should_skip_test_group(group_name="TEST_MODELS"): + pytest.skip("TEST_MODELS=DISABLE, skipping model test group", + allow_module_level=True) + marlin_not_supported = True if torch.cuda.is_available(): diff --git a/tests/models/test_llava.py b/tests/models/test_llava.py index a1f0cff1cc0e5..9e288b8d854c0 100644 --- a/tests/models/test_llava.py +++ b/tests/models/test_llava.py @@ -3,10 +3,15 @@ import pytest from transformers import AutoTokenizer +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.config import VisionLanguageConfig from ..conftest import IMAGE_FILES +if should_skip_test_group(group_name="TEST_MODELS"): + pytest.skip("TEST_MODELS=DISABLE, skipping model test group", + allow_module_level=True) + pytestmark = pytest.mark.llava # The image token is placed before "user" on purpose so that the test can pass diff --git a/tests/models/test_marlin.py b/tests/models/test_marlin.py index a3df2890f307c..debf018ca8a80 100644 --- a/tests/models/test_marlin.py +++ b/tests/models/test_marlin.py @@ -21,8 +21,13 @@ import torch from tests.models.utils import check_logprobs_close +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS +if should_skip_test_group(group_name="TEST_MODELS"): + pytest.skip("TEST_MODELS=DISABLE, skipping model test group", + allow_module_level=True) + marlin_not_supported = True if torch.cuda.is_available(): diff --git a/tests/models/test_mistral.py b/tests/models/test_mistral.py index 88f2e97fb8973..24a0de8d464dd 100644 --- a/tests/models/test_mistral.py +++ b/tests/models/test_mistral.py @@ -4,8 +4,14 @@ """ import pytest +from tests.nm_utils.utils_skip import should_skip_test_group + from .utils import check_logprobs_close +if should_skip_test_group(group_name="TEST_MODELS"): + pytest.skip("TEST_MODELS=DISABLE, skipping model test group", + allow_module_level=True) + MODELS = [ "mistralai/Mistral-7B-Instruct-v0.1", "mistralai/Mistral-7B-Instruct-v0.3", diff --git a/tests/models/test_models.py b/tests/models/test_models.py index c838cfcb6913a..d856caec8abdd 100644 --- a/tests/models/test_models.py +++ b/tests/models/test_models.py @@ -9,6 +9,12 @@ """ import pytest +from tests.nm_utils.utils_skip import should_skip_test_group + +if should_skip_test_group(group_name="TEST_MODELS"): + pytest.skip("TEST_MODELS=DISABLE, skipping model test group", + allow_module_level=True) + MODELS = [ "facebook/opt-125m", "gpt2", diff --git a/tests/models/test_models_logprobs.py b/tests/models/test_models_logprobs.py index 4ab78b8fbfe43..621be698a1160 100644 --- a/tests/models/test_models_logprobs.py +++ b/tests/models/test_models_logprobs.py @@ -5,6 +5,11 @@ import pytest from tests.models.utils import check_logprobs_close +from tests.nm_utils.utils_skip import should_skip_test_group + +if should_skip_test_group(group_name="TEST_MODELS"): + pytest.skip("TEST_MODELS=DISABLE, skipping model test group", + allow_module_level=True) MODEL_MAX_LEN = 1024 diff --git a/tests/models/test_oot_registration.py b/tests/models/test_oot_registration.py index 50ab06631500b..fa3f058ed8035 100644 --- a/tests/models/test_oot_registration.py +++ b/tests/models/test_oot_registration.py @@ -1,9 +1,15 @@ +import pytest import torch +from tests.nm_utils.utils_skip import should_skip_test_group from vllm import LLM, ModelRegistry, SamplingParams from vllm.model_executor.models.opt import OPTForCausalLM from 
vllm.model_executor.sampling_metadata import SamplingMetadata +if should_skip_test_group(group_name="TEST_MODELS"): + pytest.skip("TEST_MODELS=DISABLE, skipping model test group", + allow_module_level=True) + class MyOPTForCausalLM(OPTForCausalLM): diff --git a/tests/models/test_registry.py b/tests/models/test_registry.py index 547ab10051f1b..b44e93b9d4fef 100644 --- a/tests/models/test_registry.py +++ b/tests/models/test_registry.py @@ -1,7 +1,12 @@ import pytest +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.model_executor.models import _MODELS, ModelRegistry +if should_skip_test_group(group_name="TEST_MODELS"): + pytest.skip("TEST_MODELS=DISABLE, skipping model test group", + allow_module_level=True) + @pytest.mark.parametrize("model_cls", _MODELS) def test_registry_imports(model_cls): diff --git a/tests/models_core/__init__.py b/tests/models_core/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/models_core/test_llm_logprobs.py b/tests/models_core/test_llm_logprobs.py new file mode 100644 index 0000000000000..be776637c87f6 --- /dev/null +++ b/tests/models_core/test_llm_logprobs.py @@ -0,0 +1,57 @@ +"""Compare the outputs of HF and vLLM when using greedy sampling. + +Because of numerical precision and the fact that we are generating +over so many samples, we look + +Run `pytest tests/models/test_models_logprobs.py`. +""" +import pytest + +from tests.models.utils import check_logprobs_close +from tests.nm_utils.utils_skip import should_skip_test_group + +if should_skip_test_group(group_name="TEST_MODELS_CORE"): + pytest.skip("TEST_MODELS_CORE=DISABLE, skipping core model test group", + allow_module_level=True) + +MODEL_MAX_LEN = 1024 + +MODELS = [ + # Llama (8B param variant) + "meta-llama/Meta-Llama-3-8B-Instruct", + # Qwen2 (7B param variant) + "Qwen/Qwen2-7B-Instruct", +] + + +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("max_tokens", [32]) +@pytest.mark.parametrize("num_logprobs", [5]) +def test_models( + vllm_runner_nm, + hf_runner_nm, + example_prompts, + model: str, + max_tokens: int, + num_logprobs: int, +) -> None: + hf_model = hf_runner_nm(model) + hf_outputs = hf_model.generate_greedy_logprobs_nm(example_prompts, + max_tokens, num_logprobs) + + del hf_model + + vllm_model = vllm_runner_nm(model, max_model_len=MODEL_MAX_LEN) + vllm_outputs = vllm_model.generate_greedy_logprobs(example_prompts, + max_tokens, + num_logprobs) + + del vllm_model + + # loop through the prompts + check_logprobs_close( + outputs_0_lst=hf_outputs, + outputs_1_lst=vllm_outputs, + name_0="hf_model", + name_1="vllm_model", + ) diff --git a/tests/models/test_compressed_memory.py b/tests/models_core/test_magic_wand.py similarity index 54% rename from tests/models/test_compressed_memory.py rename to tests/models_core/test_magic_wand.py index 5d6392f4a9e45..a24618ec09a4a 100644 --- a/tests/models/test_compressed_memory.py +++ b/tests/models_core/test_magic_wand.py @@ -1,18 +1,21 @@ -"""Checks the memory usage of the sparse model is < memory usage of the -dense model by checking that the number of KV cache blocks is -bigger for the sparse model rather than the dense model. vLLM pre-allocates -the memory for the KV-cache after checking availability once the model -is loaded. This implies that using a compressed model should give more space -for the KV cache and thus more allocated blocks. - -Run `pytest tests/models/test_sparse_memory.py --forked`. -""" +"""Compare the outputs of a sparse model vs sparse model running dense. 
+Note: sparse kernels do not have bitwise correctness vs the dense models. +As a result, in this test, we just confirm that the top selected tokens of the +sparse models are in the top N selections of same model running dense. -import gc +Run `pytest tests/models/test_compressed.py`. +""" import pytest -import torch +from tests.models.utils import check_logprobs_close +from tests.nm_utils.utils_skip import should_skip_test_group + +if should_skip_test_group(group_name="TEST_MODELS_CORE"): + pytest.skip("TEST_MODELS_CORE=DISABLE, skipping core model test group", + allow_module_level=True) + +MAX_MODEL_LEN = 1024 MODEL_FORMAT_EXTRABLOCKS = [ ("nm-testing/OpenHermes-2.5-Mistral-7B-pruned50", "sparse_w16a16", 1500), ("nm-testing/OpenHermes-2.5-Mistral-7B-pruned2.4", @@ -22,10 +25,15 @@ @pytest.mark.parametrize("model_format_extrablocks", MODEL_FORMAT_EXTRABLOCKS) @pytest.mark.parametrize("dtype", ["half"]) -def test_models( +@pytest.mark.parametrize("max_tokens", [32]) +@pytest.mark.parametrize("num_logprobs", [5]) +def test_magic_wand( vllm_runner, + example_prompts, model_format_extrablocks, dtype: str, + max_tokens: int, + num_logprobs: int, ) -> None: model_name, sparsity, num_extra_blocks = model_format_extrablocks dense_model = vllm_runner(model_name=model_name, @@ -36,10 +44,9 @@ def test_models( dense_gpu_alloc = ( dense_model.model.llm_engine.scheduler.block_manager.gpu_allocator) dense_num_kv_blocks = dense_gpu_alloc.num_blocks - + dense_outputs = dense_model.generate_greedy_logprobs( + example_prompts, max_tokens, num_logprobs) del dense_model - torch.cuda.empty_cache() - gc.collect() sparse_model = vllm_runner( model_name=model_name, @@ -51,12 +58,20 @@ def test_models( sparse_gpu_alloc = ( sparse_model.model.llm_engine.scheduler.block_manager.gpu_allocator) sparse_num_kv_blocks = sparse_gpu_alloc.num_blocks - + sparse_outputs = sparse_model.generate_greedy_logprobs( + example_prompts, max_tokens, num_logprobs) del sparse_model - torch.cuda.empty_cache() - gc.collect() + # Confirm the memory is saved. assert sparse_num_kv_blocks > dense_num_kv_blocks + num_extra_blocks, ( f"Test{model_name}: Sparse model KV cache size {sparse_num_kv_blocks} " f"not bigger than dense model KV cache size {dense_num_kv_blocks} + " f"expected num_extra_blocks {num_extra_blocks}") + + # Confirm the generations are similar. + check_logprobs_close( + outputs_0_lst=dense_outputs, + outputs_1_lst=sparse_outputs, + name_0="dense", + name_1="sparse", + ) diff --git a/tests/models_core/test_server_logprobs.py b/tests/models_core/test_server_logprobs.py new file mode 100644 index 0000000000000..1477192c0ced7 --- /dev/null +++ b/tests/models_core/test_server_logprobs.py @@ -0,0 +1,194 @@ +import asyncio +import gc +import os +import time +from typing import Dict, List, Type + +import openai +import pytest +import torch +from datasets import load_dataset +from openai import AsyncOpenAI +from transformers import AutoTokenizer + +from tests.conftest import HfRunnerNM +from tests.models.compare_utils import check_logprobs_close +from tests.nm_utils.logging import make_logger +from tests.nm_utils.server import ServerContext +from tests.nm_utils.utils_skip import should_skip_test_group + +if should_skip_test_group(group_name="TEST_MODELS_CORE"): + pytest.skip("TEST_MODELS_CORE=DISABLE, skipping core model test group", + allow_module_level=True) + +# Silence warning. +os.environ["TOKENIZERS_PARALLELISM"] = "True" + +NUM_SAMPLES_TO_RUN = 20 +NUM_CHAT_TURNS = 3 # << Should be an odd number. 
+REQUEST_RATE = 2.5 +GPU_COUNT = torch.cuda.device_count() +device_capability = torch.cuda.get_device_capability() +DEVICE_CAPABILITY = device_capability[0] * 10 + device_capability[1] + +MODELS = [ + # Llama (8B param variant) + "meta-llama/Meta-Llama-3-8B-Instruct", +] + + +@pytest.fixture(scope="session") +def client(): + client = openai.AsyncOpenAI( + base_url="http://localhost:8000/v1", + api_key="token-abc123", + ) + yield client + + +@pytest.fixture +def hf_runner_nm() -> Type[HfRunnerNM]: + return HfRunnerNM + + +async def my_chat( + client, + model: str, + messages: List[Dict], + max_tokens: int, + num_logprobs: int, +): + """ submit a single prompt chat and collect results. """ + return await client.chat.completions.create(model=model, + messages=messages, + max_tokens=max_tokens, + temperature=0, + logprobs=True, + top_logprobs=num_logprobs) + + +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("max_tokens", [32]) +@pytest.mark.parametrize("num_logprobs", [3]) +@pytest.mark.parametrize("tensor_parallel_size", [1]) +def test_models_on_server( + hf_runner_nm: HfRunnerNM, + client: AsyncOpenAI, + model: str, + max_tokens: int, + num_logprobs: int, + tensor_parallel_size: int, +) -> None: + """ + This test compares the output of the vllm OpenAI server against that of + a HuggingFace transformer. We expect them to be fairly close. "Close" + is measured by checking that the top N logprobs for each token includes + the token of the other inference tool. The first time that there is no + exact match, as long as there is a match to one of the top `num_logprobs` + logprobs, the test will not proceed further, but will pass. + + :param hf_runner_nm: fixture for the HfRunnerNM + :param client: fixture with an openai.AsyncOpenAI client + :param model: The Hugginface id for a model to test with + :param max_tokens: the maximum number of tokens to generate + :param num_logprobs: the total number of logprobs checked for "close enough" + :param tensor_parallel_size: passed to the vllm Server launch + """ + logger = make_logger("vllm_test") + + # Check that we have enough GPUs to run the test. + if tensor_parallel_size > 1 and tensor_parallel_size > GPU_COUNT: + pytest.skip(f"gpu count {GPU_COUNT} is insufficient for " + f"tensor_parallel_size = {tensor_parallel_size}") + + # Load dataset. + logger.info("Loading dataset and converting to chat format.") + ds = load_dataset("nm-testing/qa-chat-prompts", + split="train_sft").select(range(NUM_SAMPLES_TO_RUN)) + messages_list = [row["messages"][:NUM_CHAT_TURNS] for row in ds] + tokenizer = AutoTokenizer.from_pretrained(model) + + # Note: its very important to tokenize here due to silliness + # around how the tokenizer works. + # + # The following examples are not equivalent: + # + # ----- + # prompt = tokenizer.apply_chat_template(message) + # ----- + # prompt = tokenizer.apply_chat_template( + # message, tokenize=False) << adds bos + # input_ids = tokenizer(prompt).input_ids << also adds bos + # ----- + input_ids_lst = [ + tokenizer.apply_chat_template(messages, + return_tensors="pt", + add_generation_prompt=True).to("cuda") + for messages in messages_list + ] + + logger.info("Generating chat responses from HF transformers.") + hf_model = hf_runner_nm(model) + hf_outputs = hf_model.generate_greedy_logprobs_nm_use_tokens( + input_ids_lst, max_tokens, num_logprobs) + # Make sure all the memory is cleaned up. 
+ del hf_model + torch.cuda.empty_cache() + gc.collect() + time.sleep(1.0) + + logger.info("Generating chat responses from vLLM server.") + api_server_args = { + "--model": model, + "--max-model-len": 4096, + "--tensor-parallel-size": tensor_parallel_size, + } + + # bfloat16 requires at least Ampere. Set to float16 otherwise. + if DEVICE_CAPABILITY < 80: + api_server_args["--dtype"] = "half" + + # TODO: Update this to work like the benchmark script. + asyncio_event_loop = asyncio.get_event_loop() + with ServerContext(api_server_args, logger=logger) as _: + chats = [] + for messages in messages_list: + chats.append( + my_chat(client, model, messages, max_tokens, num_logprobs)) + # Gather results. + results = asyncio_event_loop.run_until_complete(asyncio.gather(*chats)) + + logger.info("Processing raw data from vLLM server.") + vllm_outputs = [] + + # See https://platform.openai.com/docs/api-reference/chat/create + for result in results: + req_output = result.choices[0] + output_str = req_output.message.content + + # Unpack from req_output.logprobs.content + # logprobs.content < list of list of token data + # logprobs.content[i].token < sampled token + # logprobs.content[i].top_logprobs < top logprobs + # logprobs.content[i].top_logprobs[j].token + # logprobs.content[i].top_logprobs[j].logprob + + output_tokens = [] + output_logprobs = [] + for token_data in req_output.logprobs.content: + # Actual sampled token. + output_tokens.append(token_data.token) + # Convert TopLogProb --> List[Dict[token, logprob]] + top_logprobs = {} + for top_logprob in token_data.top_logprobs: + top_logprobs[top_logprob.token] = top_logprob.logprob + output_logprobs.append(top_logprobs) + vllm_outputs.append((output_tokens, output_str, output_logprobs)) + + logger.info("Comparing results.") + check_logprobs_close( + outputs_0_lst=hf_outputs, + outputs_1_lst=vllm_outputs, + name_0="hf_model", + name_1="vllm_model", + ) diff --git a/tests/nm_utils/server.py b/tests/nm_utils/server.py index 1cff5a42176dc..989e9c053740a 100644 --- a/tests/nm_utils/server.py +++ b/tests/nm_utils/server.py @@ -1,6 +1,5 @@ import logging import os -import shlex import subprocess import sys import time @@ -10,8 +9,6 @@ import requests import torch -from tests.nm_utils.logging import log_banner - MAX_SERVER_START_WAIT = 15 * 60 # time (seconds) to wait for server to start @@ -31,14 +28,6 @@ def __init__(self, *args, ] - if logger: - log_banner( - logger, - "server startup command", - shlex.join(self.startup_command), - logging.DEBUG, - ) - self.proc = subprocess.Popen( [ sys.executable, "-m", "vllm.entrypoints.openai.api_server", @@ -95,8 +84,6 @@ def __init__(self, args: Dict[str, str], *, def __enter__(self): """Executes the server process and waits for it to become ready.""" ray.init(ignore_reinit_error=True) - log_banner(self._logger, "server startup command args", - shlex.join(self._args)) try: self.server_runner = ServerRunner.remote(self._args, diff --git a/tests/nm_utils/utils_skip.py b/tests/nm_utils/utils_skip.py new file mode 100644 index 0000000000000..cca1b85d87049 --- /dev/null +++ b/tests/nm_utils/utils_skip.py @@ -0,0 +1,134 @@ +"""Checks environment variables to skip various test groups. +The functions here are imported by each test file. +The .github/actions/nm-test-skipping-env-setup sets these + variables in the testing automation. 
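The per-group helpers defined in this new module all share one shape: read an environment variable that defaults to ENABLE and treat only the exact value DISABLE as a skip. A condensed sketch of that shared behavior (illustrative only; the patch itself keeps one explicit function per group):

import os

def _group_disabled(env_var: str) -> bool:
    # Groups run by default; only an explicit DISABLE turns them off.
    return os.getenv(env_var, "ENABLE") == "DISABLE"

The nm-test-skipping-env-setup action referenced above exports these variables (for example TEST_LORA=DISABLE) before pytest starts, so turning a group off is a one-variable change in the workflow.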
+""" + +import os + + +def should_skip_accuracy_test_group(): + TEST_ACCURACY = os.getenv("TEST_ACCURACY", "ENABLE") + return TEST_ACCURACY == "DISABLE" + + +def should_skip_async_engine_test_group(): + TEST_ASYNC_ENGINE = os.getenv("TEST_ASYNC_ENGINE", "ENABLE") + return TEST_ASYNC_ENGINE == "DISABLE" + + +def should_skip_basic_correctness_test_group(): + TEST_BASIC_CORRECTNESS = os.getenv("TEST_BASIC_CORRECTNESS", "ENABLE") + return TEST_BASIC_CORRECTNESS == "DISABLE" + + +def should_skip_core_test_group(): + TEST_CORE = os.getenv("TEST_CORE", "ENABLE") + return TEST_CORE == "DISABLE" + + +def should_skip_distributed_test_group(): + TEST_DISTRIBUTED = os.getenv("TEST_DISTRIBUTED", "ENABLE") + return TEST_DISTRIBUTED == "DISABLE" + + +def should_skip_engine_test_group(): + TEST_ENGINE = os.getenv("TEST_ENGINE", "ENABLE") + return TEST_ENGINE == "DISABLE" + + +def should_skip_entrypoints_test_group(): + TEST_ENTRYPOINTS = os.getenv("TEST_ENTRYPOINTS", "ENABLE") + return TEST_ENTRYPOINTS == "DISABLE" + + +def should_skip_kernels_test_groups(): + TEST_KERNELS = os.getenv("TEST_KERNELS", "ENABLE") + return TEST_KERNELS == "DISABLE" + + +def should_skip_lora_test_group(): + TEST_LORA = os.getenv("TEST_LORA", "ENABLE") + return TEST_LORA == "DISABLE" + + +def should_skip_metrics_test_group(): + TEST_METRICS = os.getenv("TEST_METRICS", "ENABLE") + return TEST_METRICS == "DISABLE" + + +def should_skip_model_executor_test_group(): + TEST_MODEL_EXECUTOR = os.getenv("TEST_MODEL_EXECUTOR", "ENABLE") + return TEST_MODEL_EXECUTOR == "DISABLE" + + +def should_skip_models_test_group(): + TEST_MODELS = os.getenv("TEST_MODELS", "ENABLE") + return TEST_MODELS == "DISABLE" + + +def should_skip_models_core_test_group(): + TEST_MODELS_CORE = os.getenv("TEST_MODELS_CORE", "ENABLE") + return TEST_MODELS_CORE == "DISABLE" + + +def should_skip_prefix_caching_test_group(): + TEST_PREFIX_CACHING = os.getenv("TEST_PREFIX_CACHING", "ENABLE") + return TEST_PREFIX_CACHING == "DISABLE" + + +def should_skip_quantization_test_group(): + TEST_QUANTIZATION = os.getenv("TEST_QUANTIZATION", "ENABLE") + return TEST_QUANTIZATION == "DISABLE" + + +def should_skip_samplers_test_group(): + TEST_SAMPLERS = os.getenv("TEST_SAMPLERS", "ENABLE") + return TEST_SAMPLERS == "DISABLE" + + +def should_skip_spec_decode_test_group(): + TEST_SPEC_DECODE = os.getenv("TEST_SPEC_DECODE", "ENABLE") + return TEST_SPEC_DECODE == "DISABLE" + + +def should_skip_tensorizer_loader_test_group(): + TEST_TENSORIZER_LOADER = os.getenv("TEST_TENSORIZER_LOADER", "ENABLE") + return TEST_TENSORIZER_LOADER == "DISABLE" + + +def should_skip_tokenization_test_group(): + TEST_TOKENIZATION = os.getenv("TEST_TOKENIZATION", "ENABLE") + return TEST_TOKENIZATION == "DISABLE" + + +def should_skip_worker_test_group(): + TEST_WORKER = os.getenv("TEST_WORKER", "ENABLE") + return TEST_WORKER == "DISABLE" + + +MAP = { + "TEST_ACCURACY": should_skip_accuracy_test_group, + "TEST_ASYNC_ENGINE": should_skip_async_engine_test_group, + "TEST_BASIC_CORRECTNESS": should_skip_basic_correctness_test_group, + "TEST_CORE": should_skip_core_test_group, + "TEST_DISTRIBUTED": should_skip_distributed_test_group, + "TEST_ENGINE": should_skip_engine_test_group, + "TEST_ENTRYPOINTS": should_skip_entrypoints_test_group, + "TEST_KERNELS": should_skip_kernels_test_groups, + "TEST_LORA": should_skip_lora_test_group, + "TEST_METRICS": should_skip_metrics_test_group, + "TEST_MODELS": should_skip_models_test_group, + "TEST_MODELS_CORE": should_skip_models_core_test_group, + "TEST_PREFIX_CACHING": 
should_skip_prefix_caching_test_group, + "TEST_QUANTIZATION": should_skip_quantization_test_group, + "TEST_SAMPLERS": should_skip_samplers_test_group, + "TEST_SPEC_DECODE": should_skip_spec_decode_test_group, + "TEST_TENSORIZER_LOADER": should_skip_tensorizer_loader_test_group, + "TEST_TOKENIZATION": should_skip_tokenization_test_group, + "TEST_WORKER": should_skip_worker_test_group, +} + + +def should_skip_test_group(group_name: str) -> bool: + return MAP[group_name]() diff --git a/tests/prefix_caching/test_disable_sliding_window.py b/tests/prefix_caching/test_disable_sliding_window.py index eeac6ab43c05f..1e2dc9197b403 100644 --- a/tests/prefix_caching/test_disable_sliding_window.py +++ b/tests/prefix_caching/test_disable_sliding_window.py @@ -5,8 +5,14 @@ import pytest from tests.conftest import cleanup +from tests.nm_utils.utils_skip import should_skip_test_group from vllm import LLM +if should_skip_test_group(group_name="TEST_PREFIX_CACHING"): + pytest.skip( + "TEST_PREFIX_CACHING=DISABLE, skipping prefix caching test group", + allow_module_level=True) + MODEL_LEN_LEN = [ # Example models with sliding window. ("bigcode/starcoder2-3b", 4096, 16384), diff --git a/tests/prefix_caching/test_prefix_caching.py b/tests/prefix_caching/test_prefix_caching.py index 305596e16ef1c..7c3be3a1367b2 100644 --- a/tests/prefix_caching/test_prefix_caching.py +++ b/tests/prefix_caching/test_prefix_caching.py @@ -4,9 +4,15 @@ """ import pytest +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.core.block_manager_v1 import CachedBlockAllocator from vllm.utils import Device +if should_skip_test_group(group_name="TEST_PREFIX_CACHING"): + pytest.skip( + "TEST_PREFIX_CACHING=DISABLE, skipping prefix caching test group", + allow_module_level=True) + @pytest.mark.parametrize("block_size", [16]) @pytest.mark.parametrize("num_blocks", [16]) diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py index e6d8218b41372..510175146910d 100644 --- a/tests/quantization/test_compressed_tensors.py +++ b/tests/quantization/test_compressed_tensors.py @@ -3,13 +3,19 @@ Run `pytest tests/quantization/test_compressed_tensors.py`. 
""" +import pytest import torch +from tests.nm_utils.utils_skip import should_skip_test_group from vllm import SamplingParams from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import ( # noqa: E501 CompressedTensorsLinearMethod, CompressedTensorsW8A8DynamicToken, CompressedTensorsW8A8StaticTensor) +if should_skip_test_group(group_name="TEST_QUANTIZATION"): + pytest.skip("TEST_QUANTIZATION=DISABLE, skipping quantization test group", + allow_module_level=True) + def test_compressed_tensors_w8a8_static_setup(vllm_runner): model_path = "nm-testing/tinyllama-oneshot-w8a8-static-v2" diff --git a/tests/quantization/test_configs.py b/tests/quantization/test_configs.py index 6820b2728e3c9..3b7dcbc5983fc 100644 --- a/tests/quantization/test_configs.py +++ b/tests/quantization/test_configs.py @@ -7,8 +7,13 @@ import pytest +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.config import ModelConfig +if should_skip_test_group(group_name="TEST_QUANTIZATION"): + pytest.skip("TEST_QUANTIZATION=DISABLE, skipping quantization test group", + allow_module_level=True) + @dataclass class ModelPair: diff --git a/tests/quantization/test_fp8.py b/tests/quantization/test_fp8.py index fccce7f7b59a7..96bcd8a491d6c 100644 --- a/tests/quantization/test_fp8.py +++ b/tests/quantization/test_fp8.py @@ -5,9 +5,14 @@ import pytest import torch +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS from vllm.model_executor.layers.quantization.fp8 import Fp8LinearMethod +if should_skip_test_group(group_name="TEST_QUANTIZATION"): + pytest.skip("TEST_QUANTIZATION=DISABLE, skipping quantization test group", + allow_module_level=True) + capability = torch.cuda.get_device_capability() capability = capability[0] * 10 + capability[1] diff --git a/tests/samplers/test_beam_search.py b/tests/samplers/test_beam_search.py index 64f3ce94b7a83..19aab9d08e0cb 100644 --- a/tests/samplers/test_beam_search.py +++ b/tests/samplers/test_beam_search.py @@ -5,6 +5,12 @@ import pytest +from tests.nm_utils.utils_skip import should_skip_test_group + +if should_skip_test_group(group_name="TEST_SAMPLERS"): + pytest.skip("TEST_SAMPLERS=DISABLE, skipping sampler test group", + allow_module_level=True) + # FIXME(zhuohan): The test can not pass if we: # 1. Increase max_tokens to 256. # 2. Increase beam_width to 8. diff --git a/tests/samplers/test_ignore_eos.py b/tests/samplers/test_ignore_eos.py index dc2482d85a91f..1fed618b54fa4 100644 --- a/tests/samplers/test_ignore_eos.py +++ b/tests/samplers/test_ignore_eos.py @@ -5,8 +5,13 @@ import pytest +from tests.nm_utils.utils_skip import should_skip_test_group from vllm import SamplingParams +if should_skip_test_group(group_name="TEST_SAMPLERS"): + pytest.skip("TEST_SAMPLERS=DISABLE, skipping sampler test group", + allow_module_level=True) + # We also test with llama because it has generation_config to specify EOS # (past regression). 
 MODELS = ["facebook/opt-125m", "meta-llama/Llama-2-7b-hf"]
diff --git a/tests/samplers/test_logits_processor.py b/tests/samplers/test_logits_processor.py
index 2979470120710..49a7a18502ea6 100644
--- a/tests/samplers/test_logits_processor.py
+++ b/tests/samplers/test_logits_processor.py
@@ -1,8 +1,13 @@
 import pytest
 import torch
 
+from tests.nm_utils.utils_skip import should_skip_test_group
 from vllm import SamplingParams
 
+if should_skip_test_group(group_name="TEST_SAMPLERS"):
+    pytest.skip("TEST_SAMPLERS=DISABLE, skipping sampler test group",
+                allow_module_level=True)
+
 MODELS = ["facebook/opt-125m"]
diff --git a/tests/samplers/test_logprobs.py b/tests/samplers/test_logprobs.py
index 233540cdc391f..9dc0d6dfa7bbd 100644
--- a/tests/samplers/test_logprobs.py
+++ b/tests/samplers/test_logprobs.py
@@ -1,10 +1,15 @@
 import pytest
 import torch
 
+from tests.nm_utils.utils_skip import should_skip_test_group
 from vllm import SamplingParams
 
 from ..conftest import VllmRunner
 
+if should_skip_test_group(group_name="TEST_SAMPLERS"):
+    pytest.skip("TEST_SAMPLERS=DISABLE, skipping sampler test group",
+                allow_module_level=True)
+
 MODELS = ["facebook/opt-125m"]
diff --git a/tests/samplers/test_ranks.py b/tests/samplers/test_ranks.py
index ed2fee1ae252e..5d79d09e66590 100644
--- a/tests/samplers/test_ranks.py
+++ b/tests/samplers/test_ranks.py
@@ -1,7 +1,12 @@
 import pytest
 
+from tests.nm_utils.utils_skip import should_skip_test_group
 from vllm import SamplingParams
 
+if should_skip_test_group(group_name="TEST_SAMPLERS"):
+    pytest.skip("TEST_SAMPLERS=DISABLE, skipping sampler test group",
+                allow_module_level=True)
+
 MODELS = ["facebook/opt-125m"]
diff --git a/tests/samplers/test_rejection_sampler.py b/tests/samplers/test_rejection_sampler.py
index 273df509568d6..f7ce4d1d0c694 100644
--- a/tests/samplers/test_rejection_sampler.py
+++ b/tests/samplers/test_rejection_sampler.py
@@ -5,9 +5,14 @@
 import torch
 import torch.nn.functional as F
 
+from tests.nm_utils.utils_skip import should_skip_test_group
 from vllm.model_executor.layers.rejection_sampler import RejectionSampler
 from vllm.model_executor.utils import set_random_seed
 
+if should_skip_test_group(group_name="TEST_SAMPLERS"):
+    pytest.skip("TEST_SAMPLERS=DISABLE, skipping sampler test group",
+                allow_module_level=True)
+
 CUDA_DEVICES = [
     f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
 ]
diff --git a/tests/samplers/test_sampler.py b/tests/samplers/test_sampler.py
index f4a5eb621b573..03708e173ea33 100644
--- a/tests/samplers/test_sampler.py
+++ b/tests/samplers/test_sampler.py
@@ -8,12 +8,17 @@
 import torch
 from transformers import GenerationConfig, GenerationMixin
 
+from tests.nm_utils.utils_skip import should_skip_test_group
 from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.model_executor.utils import set_random_seed
 from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata
 from vllm.utils import Counter, is_pin_memory_available
 
+if should_skip_test_group(group_name="TEST_SAMPLERS"):
+    pytest.skip("TEST_SAMPLERS=DISABLE, skipping sampler test group",
+                allow_module_level=True)
+
 
 class MockLogitsSampler(Sampler):
diff --git a/tests/samplers/test_seeded_generate.py b/tests/samplers/test_seeded_generate.py
index 88067f19c8f07..1c91b00b3c6b4 100644
--- a/tests/samplers/test_seeded_generate.py
+++ b/tests/samplers/test_seeded_generate.py
@@ -8,9 +8,14 @@
 
 import pytest
 
+from tests.nm_utils.utils_skip import should_skip_test_group
 from vllm import SamplingParams
 from vllm.model_executor.utils import set_random_seed
 
+if should_skip_test_group(group_name="TEST_SAMPLERS"):
+    pytest.skip("TEST_SAMPLERS=DISABLE, skipping sampler test group",
+                allow_module_level=True)
+
 MODEL = "facebook/opt-125m"
 RANDOM_SEEDS = list(range(5))
diff --git a/tests/spec_decode/e2e/test_compatibility.py b/tests/spec_decode/e2e/test_compatibility.py
index 81f91c5e10b0d..5600272de9adb 100644
--- a/tests/spec_decode/e2e/test_compatibility.py
+++ b/tests/spec_decode/e2e/test_compatibility.py
@@ -1,9 +1,14 @@
 import pytest
 
+from tests.nm_utils.utils_skip import should_skip_test_group
 from vllm import SamplingParams
 
 from .conftest import get_output_from_llm_generator
 
+if should_skip_test_group(group_name="TEST_SPEC_DECODE"):
+    pytest.skip("TEST_SPEC_DECODE=DISABLE, skipping spec decode group",
+                allow_module_level=True)
+
 
 @pytest.mark.parametrize(
     "common_llm_kwargs",
diff --git a/tests/spec_decode/e2e/test_integration.py b/tests/spec_decode/e2e/test_integration.py
index 4a2b62151f8cd..14d4e3f33eb7e 100644
--- a/tests/spec_decode/e2e/test_integration.py
+++ b/tests/spec_decode/e2e/test_integration.py
@@ -4,8 +4,14 @@
 
 import pytest
 
+from tests.nm_utils.utils_skip import should_skip_test_group
+
 from .conftest import run_greedy_equality_correctness_test
 
+if should_skip_test_group(group_name="TEST_SPEC_DECODE"):
+    pytest.skip("TEST_SPEC_DECODE=DISABLE, skipping spec decode group",
+                allow_module_level=True)
+
 
 @pytest.mark.parametrize(
     "common_llm_kwargs",
diff --git a/tests/spec_decode/e2e/test_integration_dist.py b/tests/spec_decode/e2e/test_integration_dist.py
index d444ef24cbfda..80cfb7eb7b7d9 100644
--- a/tests/spec_decode/e2e/test_integration_dist.py
+++ b/tests/spec_decode/e2e/test_integration_dist.py
@@ -5,10 +5,15 @@
 import pytest
 import torch
 
+from tests.nm_utils.utils_skip import should_skip_test_group
 from vllm.utils import is_hip
 
 from .conftest import run_greedy_equality_correctness_test
 
+if should_skip_test_group(group_name="TEST_SPEC_DECODE"):
+    pytest.skip("TEST_SPEC_DECODE=DISABLE, skipping spec decode group",
+                allow_module_level=True)
+
 
 @pytest.mark.skipif(torch.cuda.device_count() < 2,
                     reason="Need at least 2 GPUs to run the test.")
diff --git a/tests/spec_decode/e2e/test_logprobs.py b/tests/spec_decode/e2e/test_logprobs.py
index c266b4c7ecebd..881e85c70fc3f 100644
--- a/tests/spec_decode/e2e/test_logprobs.py
+++ b/tests/spec_decode/e2e/test_logprobs.py
@@ -3,10 +3,15 @@
 
 import pytest
 
+from tests.nm_utils.utils_skip import should_skip_test_group
 from vllm import SamplingParams
 
 from .conftest import get_logprobs_from_llm_generator
 
+if should_skip_test_group(group_name="TEST_SPEC_DECODE"):
+    pytest.skip("TEST_SPEC_DECODE=DISABLE, skipping spec decode group",
+                allow_module_level=True)
+
 
 @pytest.mark.skip("Out of CPU Memory in NM Automation")
 @pytest.mark.parametrize(
diff --git a/tests/spec_decode/e2e/test_multistep_correctness.py b/tests/spec_decode/e2e/test_multistep_correctness.py
index 94d71fb012727..e9814d81d8f8d 100644
--- a/tests/spec_decode/e2e/test_multistep_correctness.py
+++ b/tests/spec_decode/e2e/test_multistep_correctness.py
@@ -33,11 +33,16 @@
 import pytest
 from transformers import AutoTokenizer
 
+from tests.nm_utils.utils_skip import should_skip_test_group
 from vllm import SamplingParams
 
 from .conftest import (get_output_from_llm_generator,
                        run_greedy_equality_correctness_test)
 
+if should_skip_test_group(group_name="TEST_SPEC_DECODE"):
+    pytest.skip("TEST_SPEC_DECODE=DISABLE, skipping spec decode group",
+                allow_module_level=True)
+
 
 @pytest.mark.parametrize(
     "common_llm_kwargs",
diff --git a/tests/spec_decode/e2e/test_ngram_correctness.py b/tests/spec_decode/e2e/test_ngram_correctness.py
index d475d37af6425..1dbdc2c82447d 100644
--- a/tests/spec_decode/e2e/test_ngram_correctness.py
+++ b/tests/spec_decode/e2e/test_ngram_correctness.py
@@ -26,8 +26,14 @@
 
 import pytest
 
+from tests.nm_utils.utils_skip import should_skip_test_group
+
 from .conftest import run_greedy_equality_correctness_test
 
+if should_skip_test_group(group_name="TEST_SPEC_DECODE"):
+    pytest.skip("TEST_SPEC_DECODE=DISABLE, skipping spec decode group",
+                allow_module_level=True)
+
 
 @pytest.mark.parametrize(
     "common_llm_kwargs",
diff --git a/tests/spec_decode/test_batch_expansion.py b/tests/spec_decode/test_batch_expansion.py
index 43cfd78ddb0cc..0b9ebe4e63556 100644
--- a/tests/spec_decode/test_batch_expansion.py
+++ b/tests/spec_decode/test_batch_expansion.py
@@ -1,10 +1,15 @@
 import pytest
 import torch
 
+from tests.nm_utils.utils_skip import should_skip_test_group
 from vllm.spec_decode.batch_expansion import BatchExpansionTop1Scorer
 
 from .utils import create_seq_group_metadata_from_prompts, mock_worker
 
+if should_skip_test_group(group_name="TEST_SPEC_DECODE"):
+    pytest.skip("TEST_SPEC_DECODE=DISABLE, skipping spec decode group",
+                allow_module_level=True)
+
 
 @pytest.mark.parametrize('num_target_seq_ids', [100])
 @pytest.mark.skip_global_cleanup
diff --git a/tests/spec_decode/test_dynamic_spec_decode.py b/tests/spec_decode/test_dynamic_spec_decode.py
index bb6d1c23a0039..afb73c8a92a03 100644
--- a/tests/spec_decode/test_dynamic_spec_decode.py
+++ b/tests/spec_decode/test_dynamic_spec_decode.py
@@ -3,6 +3,7 @@
 import pytest
 import torch
 
+from tests.nm_utils.utils_skip import should_skip_test_group
 from vllm.model_executor.layers.rejection_sampler import RejectionSampler
 from vllm.sequence import ExecuteModelRequest
 from vllm.spec_decode.metrics import AsyncMetricsCollector
@@ -12,7 +13,11 @@
 
 from .utils import create_batch, mock_worker
 
+if should_skip_test_group(group_name="TEST_SPEC_DECODE"):
+    pytest.skip("TEST_SPEC_DECODE=DISABLE, skipping spec decode group",
+                allow_module_level=True)
+
 
 @pytest.mark.parametrize('queue_size', [4])
 @pytest.mark.parametrize('batch_size', [1])
 @pytest.mark.parametrize('k', [1])
diff --git a/tests/spec_decode/test_metrics.py b/tests/spec_decode/test_metrics.py
index 312878804b86e..d1141d67c38f6 100644
--- a/tests/spec_decode/test_metrics.py
+++ b/tests/spec_decode/test_metrics.py
@@ -4,8 +4,13 @@
 import pytest
 import torch
 
+from tests.nm_utils.utils_skip import should_skip_test_group
 from vllm.spec_decode.metrics import AsyncMetricsCollector
 
+if should_skip_test_group(group_name="TEST_SPEC_DECODE"):
+    pytest.skip("TEST_SPEC_DECODE=DISABLE, skipping spec decode group",
+                allow_module_level=True)
+
 
 def test_initial_call_returns_none():
     """Expect first call to get metrics to return None.
diff --git a/tests/spec_decode/test_multi_step_worker.py b/tests/spec_decode/test_multi_step_worker.py
index 6cea6668acc91..358aecca926d4 100644
--- a/tests/spec_decode/test_multi_step_worker.py
+++ b/tests/spec_decode/test_multi_step_worker.py
@@ -4,6 +4,7 @@
 import pytest
 import torch
 
+from tests.nm_utils.utils_skip import should_skip_test_group
 from vllm.model_executor.utils import set_random_seed
 from vllm.sequence import ExecuteModelRequest, SamplerOutput
 from vllm.spec_decode.multi_step_worker import MultiStepWorker
@@ -14,6 +15,10 @@
                     create_seq_group_metadata_from_prompts, create_worker,
                     patch_execute_model_with_seeds, zero_kv_cache)
 
+if should_skip_test_group(group_name="TEST_SPEC_DECODE"):
+    pytest.skip("TEST_SPEC_DECODE=DISABLE, skipping spec decode group",
+                allow_module_level=True)
+
 
 @pytest.mark.parametrize('num_steps', list(range(1, 17)))
 def test_assert_enough_kv_space(num_steps: int):
diff --git a/tests/spec_decode/test_ngram_worker.py b/tests/spec_decode/test_ngram_worker.py
index b1537884f896e..9ce9213207714 100644
--- a/tests/spec_decode/test_ngram_worker.py
+++ b/tests/spec_decode/test_ngram_worker.py
@@ -1,11 +1,17 @@
+import pytest
 import torch
 
+from tests.nm_utils.utils_skip import should_skip_test_group
 from vllm.sequence import ExecuteModelRequest
 from vllm.spec_decode.ngram_worker import NGramWorker
 from vllm.spec_decode.top1_proposer import Top1Proposer
 
 from .utils import create_seq_group_metadata_from_prompts, create_worker
 
+if should_skip_test_group(group_name="TEST_SPEC_DECODE"):
+    pytest.skip("TEST_SPEC_DECODE=DISABLE, skipping spec decode group",
+                allow_module_level=True)
+
 
 def test_ngram_algo_correctness_for_single_no_match():
     """Verify our ngram algo find the right candidate in the prompt
diff --git a/tests/spec_decode/test_spec_decode_worker.py b/tests/spec_decode/test_spec_decode_worker.py
index ef9d32f73d668..4c098246ab1a4 100644
--- a/tests/spec_decode/test_spec_decode_worker.py
+++ b/tests/spec_decode/test_spec_decode_worker.py
@@ -5,6 +5,7 @@
 import pytest
 import torch
 
+from tests.nm_utils.utils_skip import should_skip_test_group
 from vllm.model_executor.layers.rejection_sampler import RejectionSampler
 from vllm.model_executor.utils import set_random_seed
 from vllm.sequence import ExecuteModelRequest, SamplerOutput
@@ -17,6 +18,10 @@
 
 from .utils import create_batch, create_sampler_output_list, mock_worker
 
+if should_skip_test_group(group_name="TEST_SPEC_DECODE"):
+    pytest.skip("TEST_SPEC_DECODE=DISABLE, skipping spec decode group",
+                allow_module_level=True)
+
 
 @pytest.mark.parametrize('k', [1, 2, 6])
 @pytest.mark.parametrize('batch_size', [1, 2, 32])
diff --git a/tests/spec_decode/test_utils.py b/tests/spec_decode/test_utils.py
index 6b6f35a1a1d05..bdc72346ab011 100644
--- a/tests/spec_decode/test_utils.py
+++ b/tests/spec_decode/test_utils.py
@@ -2,9 +2,14 @@
 
 import pytest
 
+from tests.nm_utils.utils_skip import should_skip_test_group
 from vllm.sequence import SequenceGroupMetadata
 from vllm.spec_decode.util import get_all_seq_ids, split_batch_by_proposal_len
 
+if should_skip_test_group(group_name="TEST_SPEC_DECODE"):
+    pytest.skip("TEST_SPEC_DECODE=DISABLE, skipping spec decode group",
+                allow_module_level=True)
+
 
 def test_get_all_seq_ids():
     """Verify get_all_seq_ids extracts all seq ids.
diff --git a/tests/tensorizer_loader/test_tensorizer.py b/tests/tensorizer_loader/test_tensorizer.py
index 70d789e97c12c..39dc67a3f336f 100644
--- a/tests/tensorizer_loader/test_tensorizer.py
+++ b/tests/tensorizer_loader/test_tensorizer.py
@@ -7,6 +7,7 @@
 import pytest
 import ray
 
+from tests.nm_utils.utils_skip import should_skip_test_group
 from tests.utils import ServerRunner
 from vllm import SamplingParams
 # yapf: disable
@@ -20,6 +21,10 @@
 
 # yapf conflicts with isort for this docstring
 
+if should_skip_test_group(group_name="TEST_TENSORIZER_LOADER"):
+    pytest.skip("TEST_TENSORIZER_LOADER=DISABLE, skipping tensorizer group",
+                allow_module_level=True)
+
 prompts = [
     "Hello, my name is",
     "The president of the United States is",
diff --git a/tests/test_sharded_state_loader.py b/tests/test_sharded_state_loader.py
index de79c3b945d4d..5bb3a5c5d65e0 100644
--- a/tests/test_sharded_state_loader.py
+++ b/tests/test_sharded_state_loader.py
@@ -72,6 +72,7 @@ def _run_generate(input_dir, queue: mp.Queue, **kwargs):
     queue.join_thread()
 
 
+@pytest.mark.skip("Timeout error in NM automation. Work to re-enable.")
 @pytest.mark.parametrize("enable_lora", [False, True])
 @pytest.mark.parametrize("tp_size", [1, 2])
 def test_sharded_state_loader(enable_lora, tp_size, num_gpus_available,
diff --git a/tests/tokenization/test_cached_tokenizer.py b/tests/tokenization/test_cached_tokenizer.py
index 4c8238fd8d113..dbd17b88282a0 100644
--- a/tests/tokenization/test_cached_tokenizer.py
+++ b/tests/tokenization/test_cached_tokenizer.py
@@ -1,9 +1,15 @@
 from copy import deepcopy
 
+import pytest
 from transformers import AutoTokenizer
 
+from tests.nm_utils.utils_skip import should_skip_test_group
 from vllm.transformers_utils.tokenizer import get_cached_tokenizer
 
+if should_skip_test_group(group_name="TEST_TOKENIZATION"):
+    pytest.skip("TEST_TOKENIZATION=DISABLE, skipping tokenization test group",
+                allow_module_level=True)
+
 
 def test_cached_tokenizer():
     reference_tokenizer = AutoTokenizer.from_pretrained("gpt2")
diff --git a/tests/tokenization/test_detokenize.py b/tests/tokenization/test_detokenize.py
index 8d019fe5f38ca..a48cfe6fed01f 100644
--- a/tests/tokenization/test_detokenize.py
+++ b/tests/tokenization/test_detokenize.py
@@ -3,11 +3,16 @@
 import pytest
 from transformers import AutoTokenizer
 
+from tests.nm_utils.utils_skip import should_skip_test_group
 from vllm.sequence import Logprob, SamplingParams, Sequence, SequenceGroup
 from vllm.transformers_utils.detokenizer import (Detokenizer,
                                                  detokenize_incrementally)
 from vllm.transformers_utils.tokenizer_group import get_tokenizer_group
 
+if should_skip_test_group(group_name="TEST_TOKENIZATION"):
+    pytest.skip("TEST_TOKENIZATION=DISABLE, skipping tokenization test group",
+                allow_module_level=True)
+
 TRUTH = [
     "Hello here, this is a simple test",
     "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs. It is designed to be used in production environments, where inference and serving",  # noqa
diff --git a/tests/tokenization/test_tokenizer.py b/tests/tokenization/test_tokenizer.py
index 8db7204f15d4e..119fbd2d02e4f 100644
--- a/tests/tokenization/test_tokenizer.py
+++ b/tests/tokenization/test_tokenizer.py
@@ -1,8 +1,13 @@
 import pytest
 from transformers import PreTrainedTokenizerBase
 
+from tests.nm_utils.utils_skip import should_skip_test_group
 from vllm.transformers_utils.tokenizer import get_tokenizer
 
+if should_skip_test_group(group_name="TEST_TOKENIZATION"):
+    pytest.skip("TEST_TOKENIZATION=DISABLE, skipping tokenization test group",
+                allow_module_level=True)
+
 TOKENIZER_NAMES = [
     "facebook/opt-125m",
     "gpt2",
diff --git a/tests/tokenization/test_tokenizer_group.py b/tests/tokenization/test_tokenizer_group.py
index 31571dbfff6f6..e18ee99494f65 100644
--- a/tests/tokenization/test_tokenizer_group.py
+++ b/tests/tokenization/test_tokenizer_group.py
@@ -5,6 +5,7 @@
 import pytest
 from transformers import AutoTokenizer, PreTrainedTokenizerBase
 
+from tests.nm_utils.utils_skip import should_skip_test_group
 from vllm.transformers_utils.tokenizer_group import get_tokenizer_group
 from vllm.transformers_utils.tokenizer_group.ray_tokenizer_group import (
     RayTokenizerGroupPool)
@@ -13,6 +14,10 @@
 
 from ..conftest import get_tokenizer_pool_config
 
+if should_skip_test_group(group_name="TEST_TOKENIZATION"):
+    pytest.skip("TEST_TOKENIZATION=DISABLE, skipping tokenization test group",
+                allow_module_level=True)
+
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize("tokenizer_group_type", [None, "ray"])
diff --git a/tests/worker/test_model_runner.py b/tests/worker/test_model_runner.py
index 92de545acd53d..e40de0dc027d8 100644
--- a/tests/worker/test_model_runner.py
+++ b/tests/worker/test_model_runner.py
@@ -1,6 +1,7 @@
 import pytest
 import torch
 
+from tests.nm_utils.utils_skip import should_skip_test_group
 from vllm.distributed.parallel_state import init_distributed_environment
 from vllm.engine.arg_utils import EngineArgs
 from vllm.model_executor.sampling_metadata import SamplingMetadata
@@ -8,6 +9,10 @@
 from vllm.utils import get_open_port
 from vllm.worker.model_runner import ModelRunner, _get_graph_batch_size
 
+if should_skip_test_group(group_name="TEST_WORKER"):
+    pytest.skip("TEST_WORKER=DISABLE, skipping worker test group",
+                allow_module_level=True)
+
 
 def _create_model_runner(model: str, *args, **kwargs) -> ModelRunner:
     engine_args = EngineArgs(model, *args, **kwargs)
diff --git a/tests/worker/test_swap.py b/tests/worker/test_swap.py
index d941ffdb5588a..cd3807a133cd4 100644
--- a/tests/worker/test_swap.py
+++ b/tests/worker/test_swap.py
@@ -1,10 +1,16 @@
+import pytest
 import torch
 
+from tests.nm_utils.utils_skip import should_skip_test_group
 from vllm.engine.arg_utils import EngineArgs
 from vllm.sequence import ExecuteModelRequest
 from vllm.utils import get_distributed_init_method, get_ip, get_open_port
 from vllm.worker.worker import Worker
 
+if should_skip_test_group(group_name="TEST_WORKER"):
+    pytest.skip("TEST_WORKER=DISABLE, skipping worker test group",
+                allow_module_level=True)
+
 
 def test_swap() -> None:
     # Configure the engine.