Merge branch 'main' into rel-eng/dial-in-accuracy-tests

neuralmagic · Jun 20, 2024 · 0dba00a · 0dba00a · github-actions · Jun 20, 2024
2 parents ce0fcdb + abc0ceb
commit 0dba00a
Show file tree

Hide file tree

Showing 133 changed files with 4,486 additions and 1,558 deletions.
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
@@ -48,6 +48,7 @@ steps:
   - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
   - pytest -v -s spec_decode/e2e/test_integration_dist.py
   - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
+  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py
 
 - label: Distributed Tests (Multiple Groups)
   #mirror_hardwares: [amd]

diff --git a/.buildkite/test-template-aws.j2 b/.buildkite/test-template-aws.j2
@@ -7,7 +7,7 @@ steps:
       queue: cpu_queue
     commands:
       - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "docker build --build-arg max_jobs=16 --tag {{ docker_image }} --target test --progress plain ."
+      - "docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --tag {{ docker_image }} --target test --progress plain ."
       - "docker push {{ docker_image }}"
     env:
       DOCKER_BUILDKIT: "1"
@@ -19,6 +19,34 @@ steps:
           limit: 5
   - wait
 
+  - group: "AMD Tests"
+    depends_on: ~
+    steps:
+    {% for step in steps %}
+    {% if step.mirror_hardwares and "amd" in step.mirror_hardwares %}
+      - label: "AMD: {{ step.label }}"
+        agents:
+          queue: amd
+        command: bash .buildkite/run-amd-test.sh "cd {{ (step.working_dir or default_working_dir) | safe  }} ; {{ step.command  or (step.commands | join(" ; ")) | safe }}"
+        env:
+          DOCKER_BUILDKIT: "1"
+        soft_fail: true
+    {% endif %}
+    {% endfor %}
+
+  - label: "Neuron Test"
+    depends_on: ~
+    agents:
+      queue: neuron
+    command: bash .buildkite/run-neuron-test.sh
+    soft_fail: false
+
+  - label: "Intel Test"
+    depends_on: ~
+    agents:
+      queue: intel
+    command: bash .buildkite/run-cpu-test.sh
+
   {% for step in steps %}
   - label: "{{ step.label }}"
     agents:
@@ -31,7 +59,7 @@ steps:
       {% else %}
       queue: gpu_1_queue
       {% endif %}
-    soft_fail: true
+    soft_fail: {{ step.soft_fail or false }}
     {% if step.parallelism %}
     parallelism: {{ step.parallelism }}
     {% endif %}

diff --git a/.github/actions/nm-install-testmo/action.yml b/.github/actions/nm-install-testmo/action.yml
diff --git a/.github/actions/nm-set-env/action.yml b/.github/actions/nm-set-env/action.yml
@@ -39,6 +39,9 @@ runs:
       # testmo
       echo "XDG_CONFIG_HOME=/usr/local/apps" >> $GITHUB_ENV
       echo "PROJECT_ID=12" >> $GITHUB_ENV
+      # disable usage stats (writes to protected /usr/local/apps)
+      echo "VLLM_NO_USAGE_STATS=1" >> $GITHUB_ENV
+      echo "DO_NOT_TRACK=1" >> $GITHUB_ENV
     env:
         HF_TOKEN_SECRET: ${{ inputs.hf_token }}
     shell: bash
diff --git a/.github/actions/nm-set-python/action.yml b/.github/actions/nm-set-python/action.yml
diff --git a/.github/scripts/run-tests b/.github/scripts/run-tests
@@ -109,6 +109,10 @@ do
     LOCAL_SUCCESS=0
     RESULT_XML=$(echo ${TEST} | sed -e "s/${TEST_DIR}/${RESULTS_DIR}/" | sed -e "s/.py/.xml/")
 
+    # report which test is being run
+    # (in CI, if a test hangs, this logs *which* test is running *before* it hangs)
+    echo "=== RUNNING TEST: ${TEST} ==="
+
     # this is a bit messy and brittle, but certain tests
     # need to be run with specific options
     if [[ "${TEST}" == *"kernels"* || "${TEST}" == *"samplers"* ]]; then
@@ -125,7 +129,18 @@ do
         pytest ${CC_PYTEST_FLAGS} --junitxml=${RESULT_XML} ${TEST} || LOCAL_SUCCESS=$?
     fi
 
-    SUCCESS=$((SUCCESS + LOCAL_SUCCESS))
+    # if a file gets exit code 0, we are good
+    if [[ $LOCAL_SUCCESS == 0 ]]; then
+        echo "=== PASSED TEST: ${TEST} ==="
+    # if a file does not run any tests, pytest reports exit code of 5
+    # since we skip full modules in our skipping strategy, this is common
+    elif [[ $LOCAL_SUCCESS == 5 ]]; then
+        echo "=== SKIPPED TEST: ${TEST} ==="
+    # otherwise, report failure
+    else
+        echo "=== FAILED TEST: ${TEST} ==="
+        SUCCESS=$((SUCCESS + LOCAL_SUCCESS))
+    fi
 
 done
 

diff --git a/.github/workflows/nm-build.yml b/.github/workflows/nm-build.yml
@@ -98,7 +98,7 @@ jobs:
 
             - name: set python
               id: set_python
-              uses: ./.github/actions/nm-set-python/
+              uses: neuralmagic/nm-actions/actions/set-python@main
               with:
                 python: ${{ inputs.python }}
                 venv: ${{ env.VENV_BASE }}

diff --git a/.github/workflows/nm-nightly.yml b/.github/workflows/nm-nightly.yml
@@ -27,7 +27,7 @@ jobs:
             test_label_solo: gcp-k8s-l4-solo
             test_label_multi: ignore
             test_timeout: 480
-            test_skip_list: neuralmagic/tests/skip-for-nightly.txt
+            test_skip_env_vars: neuralmagic/tests/test_skip_env_vars/full.txt
 
             benchmark_label: gcp-k8s-l4-solo
             benchmark_config_list_file: ./.github/data/nm_benchmark_remote_push_configs_list.txt

diff --git a/.github/workflows/nm-test.yml b/.github/workflows/nm-test.yml
@@ -94,7 +94,7 @@ jobs:
                 nvcc_threads: 0
 
             - name: install testmo
-              uses: ./.github/actions/nm-install-testmo/
+              uses: neuralmagic/nm-actions/actions/install-testmo@main
 
             - name: create testmo run
               id: create_testmo_run
@@ -131,7 +131,7 @@ jobs:
             - name: run buildkite script
               run: |
                 cd tests && sudo bash ../.buildkite/download-images.sh
-            
+
             - name: setenv test skip
               id: setenv_test_skip
               uses: ./.github/actions/nm-set-env-test-skip

diff --git a/.github/workflows/ruff.yml b/.github/workflows/ruff.yml
@@ -25,7 +25,7 @@ jobs:
     - name: Install dependencies
       run: |
         python -m pip install --upgrade pip
-        pip install ruff==0.1.5 codespell==2.2.6 tomli==2.0.1 isort==5.13.2
+        pip install ruff==0.1.5 codespell==2.3.0 tomli==2.0.1 isort==5.13.2
     - name: Analysing the code with ruff
       run: |
         ruff .

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -8,6 +8,7 @@ message(STATUS "Build type: ${CMAKE_BUILD_TYPE}")
 message(STATUS "Target device: ${VLLM_TARGET_DEVICE}")
 
 include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake)
+include(${CMAKE_CURRENT_LIST_DIR}/cmake/dep.cmake)
 
 #
 # Supported python versions.  These versions will be searched in order, the
@@ -179,17 +180,17 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     "csrc/quantization/gptq_marlin/gptq_marlin.cu"
     "csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
     "csrc/custom_all_reduce.cu"
-    "csrc/quantization/cutlass_w8a8/scaled_mm_dq_entry.cu"
-    "csrc/quantization/cutlass_w8a8/scaled_mm_dq_c2x.cu"
-    "csrc/quantization/cutlass_w8a8/scaled_mm_dq_c3x.cu")
+    "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
+    "csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu"
+    "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu")
 
   #
   # The CUTLASS kernels for Hopper require sm90a to be enabled.
   # This is done via the below gencode option, BUT that creates kernels for both sm90 and sm90a.
   # That adds an extra 17MB to compiled binary, so instead we selectively enable it.
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0)
     set_source_files_properties(
-          "csrc/quantization/cutlass_w8a8/scaled_mm_dq_c3x.cu"
+          "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu"
           PROPERTIES
           COMPILE_FLAGS
           "-gencode arch=compute_90a,code=sm_90a")
@@ -206,7 +207,8 @@ define_gpu_extension_target(
   ARCHITECTURES ${VLLM_GPU_ARCHES}
   INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR};${CUTLASS_TOOLS_UTIL_INCLUDE_DIR}
   USE_SABI 3
-  WITH_SOABI)
+  WITH_SOABI
+  LIBRARIES cmake_git_version_tracking)
 
 #
 # _moe_C extension
@@ -224,7 +226,8 @@ define_gpu_extension_target(
   COMPILE_FLAGS ${VLLM_GPU_FLAGS}
   ARCHITECTURES ${VLLM_GPU_ARCHES}
   USE_SABI 3
-  WITH_SOABI)
+  WITH_SOABI
+  LIBRARIES cmake_git_version_tracking)
 
 #
 # _punica_C extension
@@ -276,7 +279,8 @@ if (VLLM_PUNICA_GPU_ARCHES)
     COMPILE_FLAGS ${VLLM_PUNICA_GPU_FLAGS}
     ARCHITECTURES ${VLLM_PUNICA_GPU_ARCHES}
     USE_SABI 3
-    WITH_SOABI)
+    WITH_SOABI
+    LIBRARIES cmake_git_version_tracking)
 else()
   message(WARNING "Unable to create _punica_C target because none of the "
     "requested architectures (${VLLM_GPU_ARCHES}) are supported, i.e. >= 8.0")

diff --git a/Dockerfile b/Dockerfile
@@ -9,8 +9,8 @@
 # prepare basic build environment
 FROM nvidia/cuda:12.4.1-devel-ubuntu22.04 AS dev
 
-RUN apt-get update -y && \
-    apt-get install -y python3-pip git
+RUN apt-get update -y \
+    && apt-get install -y python3-pip git curl sudo
 
 # Workaround for https://github.com/openai/triton/issues/2507 and
 # https://github.com/pytorch/pytorch/issues/107960 -- hopefully
@@ -27,6 +27,8 @@ RUN --mount=type=cache,target=/root/.cache/pip \
     pip install -r requirements-cuda.txt
 
 # install development dependencies
+COPY requirements-lint.txt requirements-lint.txt
+COPY requirements-test.txt requirements-test.txt
 COPY requirements-dev.txt requirements-dev.txt
 RUN --mount=type=cache,target=/root/.cache/pip \
     pip install -r requirements-dev.txt

diff --git a/Dockerfile.cpu b/Dockerfile.cpu
@@ -3,9 +3,13 @@
 FROM ubuntu:22.04 AS cpu-test-1
 
 RUN apt-get update  -y \
-    && apt-get install -y git wget vim numactl gcc-12 g++-12 python3 python3-pip \
+    && apt-get install -y git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 \
     && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
 
+RUN echo 'export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD' >> ~/.bashrc
+
+RUN pip install https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/cpu/intel_extension_for_pytorch-2.3.100%2Bgit0eb3473-cp310-cp310-linux_x86_64.whl
+
 RUN pip install --upgrade pip \
     && pip install wheel packaging ninja "setuptools>=49.4.0" numpy
 
@@ -21,6 +25,6 @@ RUN VLLM_TARGET_DEVICE=cpu python3 setup.py install
 
 WORKDIR /workspace/
 
-RUN ln -s /workspace/vllm/tests  && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
+RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
 
 CMD ["/bin/bash"]
diff --git a/Dockerfile.tpu b/Dockerfile.tpu
@@ -0,0 +1,19 @@
+ARG NIGHTLY_DATE="20240601"
+ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.10_tpuvm_$NIGHTLY_DATE"
+
+FROM $BASE_IMAGE
+
+WORKDIR /workspace
+COPY . /workspace/vllm
+
+ENV VLLM_TARGET_DEVICE="tpu"
+# Install aiohttp separately to avoid build errors.
+RUN pip install aiohttp
+# Install the TPU and Pallas dependencies.
+RUN pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html
+RUN pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
+
+# Build vLLM.
+RUN cd /workspace/vllm && python setup.py develop
+
+CMD ["/bin/bash"]
diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py
@@ -68,9 +68,13 @@ async def async_request_tgi(
                         chunk_bytes = chunk_bytes.strip()
                         if not chunk_bytes:
                             continue
+                        chunk_bytes = chunk_bytes.decode("utf-8")
 
-                        chunk = remove_prefix(chunk_bytes.decode("utf-8"),
-                                              "data:")
+                        #NOTE: Sometimes TGI returns a ping response without
+                        # any data, we should skip it.
+                        if chunk_bytes.startswith(":"):
+                            continue
+                        chunk = remove_prefix(chunk_bytes, "data:")
 
                         data = json.loads(chunk)
                         timestamp = time.perf_counter()

diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py
@@ -189,7 +189,7 @@ def run_to_completion(profile_dir: Optional[str] = None):
         "--device",
         type=str,
         default="cuda",
-        choices=["cuda", "cpu"],
+        choices=["cuda", "cpu", "tpu"],
         help='device type for vLLM execution, supporting CUDA and CPU.')
     parser.add_argument('--block-size',
                         type=int,

diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py
@@ -346,7 +346,7 @@ def main(args: argparse.Namespace):
         "--device",
         type=str,
         default="cuda",
-        choices=["cuda", "cpu"],
+        choices=["cuda", "cpu", "tpu"],
         help='device type for vLLM execution, supporting CUDA and CPU.')
     parser.add_argument(
         "--enable-prefix-caching",

diff --git a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
@@ -76,11 +76,7 @@ def pytorch_fp8_impl_fast_accum(a: torch.tensor, b: torch.tensor,
 def cutlass_impl(a: torch.tensor, b: torch.tensor, scale_a: torch.tensor,
                  scale_b: torch.tensor,
                  out_dtype: torch.dtype) -> torch.tensor:
-    return ops.cutlass_scaled_mm_dq(a,
-                                    b,
-                                    scale_a,
-                                    scale_b,
-                                    out_dtype=out_dtype)
+    return ops.cutlass_scaled_mm(a, b, scale_a, scale_b, out_dtype=out_dtype)
 
 
 # bench

diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake
@@ -33,6 +33,7 @@ function (find_isa CPUINFO TARGET OUT)
     endif()
 endfunction()
 
+find_isa(${CPUINFO} "avx2" AVX2_FOUND)
 find_isa(${CPUINFO} "avx512f" AVX512_FOUND)
 
 if (AVX512_FOUND)
@@ -53,8 +54,11 @@ if (AVX512_FOUND)
     else()
         message(WARNING "Disable AVX512-BF16 ISA support, no avx512_bf16 found in local CPU flags." " If cross-compilation is required, please set env VLLM_CPU_AVX512BF16=1.")
     endif()
+elseif (AVX2_FOUND)
+    list(APPEND CXX_COMPILE_FLAGS "-mavx2")
+    message(WARNING "vLLM CPU backend using AVX2 ISA")
 else()
-    message(FATAL_ERROR "vLLM CPU backend requires AVX512 ISA support.")
+    message(FATAL_ERROR "vLLM CPU backend requires AVX512 or AVX2 ISA support.")
 endif()
 
 message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}")

diff --git a/cmake/dep.cmake b/cmake/dep.cmake
@@ -0,0 +1,6 @@
+include(FetchContent)
+FetchContent_Declare(cmake_git_version_tracking                   
+  GIT_REPOSITORY https://github.com/andrew-hardin/cmake-git-version-tracking.git
+  GIT_TAG 6c0cb87edd029ddfb403a8e24577c144a03605a6
+)
+FetchContent_MakeAvailable(cmake_git_version_tracking)
Benchmark suite	Current: `0dba00a`	Previous: `d8da97b`	Ratio
`{"name": "request_throughput", "description": "VLLM Engine throughput - synthetic\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 128,\n \"num-prompts\": 1000\n}", "gpu_description": "NVIDIA L4 x 1", "vllm_version": "0.5.0", "python_version": "3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]", "torch_version": "2.3.0+cu121"}`	`2.4834617389958384` prompts/s
`{"name": "token_throughput", "description": "VLLM Engine throughput - synthetic\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 128,\n \"num-prompts\": 1000\n}", "gpu_description": "NVIDIA L4 x 1", "vllm_version": "0.5.0", "python_version": "3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]", "torch_version": "2.3.0+cu121"}`	`953.649307774402` tokens/s
Benchmark suite	Current: `0dba00a`	Previous: `d8da97b`	Ratio
`{"name": "request_throughput", "description": "VLLM Engine throughput - synthetic\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 128,\n \"num-prompts\": 1000\n}", "gpu_description": "NVIDIA L4 x 1", "vllm_version": "0.5.0", "python_version": "3.11.4 (main, Jun 7 2023, 11:01:02) [GCC 11.3.0]", "torch_version": "2.3.0+cu121"}`	`2.493410950719893` prompts/s
`{"name": "token_throughput", "description": "VLLM Engine throughput - synthetic\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 128,\n \"num-prompts\": 1000\n}", "gpu_description": "NVIDIA L4 x 1", "vllm_version": "0.5.0", "python_version": "3.11.4 (main, Jun 7 2023, 11:01:02) [GCC 11.3.0]", "torch_version": "2.3.0+cu121"}`	`957.4698050764389` tokens/s
Benchmark suite	Current: `0dba00a`	Previous: `d8da97b`	Ratio
`{"name": "request_throughput", "description": "VLLM Engine throughput - synthetic\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 128,\n \"num-prompts\": 1000\n}", "gpu_description": "NVIDIA L4 x 1", "vllm_version": "0.5.0", "python_version": "3.8.17 (default, Jun 7 2023, 12:29:56) \n[GCC 11.3.0]", "torch_version": "2.3.0+cu121"}`	`2.487254606871566` prompts/s
`{"name": "token_throughput", "description": "VLLM Engine throughput - synthetic\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 128,\n \"num-prompts\": 1000\n}", "gpu_description": "NVIDIA L4 x 1", "vllm_version": "0.5.0", "python_version": "3.8.17 (default, Jun 7 2023, 12:29:56) \n[GCC 11.3.0]", "torch_version": "2.3.0+cu121"}`	`955.1057690386813` tokens/s
Benchmark suite	Current: `0dba00a`	Previous: `d8da97b`	Ratio
`{"name": "request_throughput", "description": "VLLM Engine throughput - synthetic\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 128,\n \"num-prompts\": 1000\n}", "gpu_description": "NVIDIA L4 x 1", "vllm_version": "0.5.0", "python_version": "3.9.17 (main, Jun 7 2023, 12:34:12) \n[GCC 11.3.0]", "torch_version": "2.3.0+cu121"}`	`2.5014163422745868` prompts/s	`2.4505808553524178` prompts/s	`0.98`
`{"name": "token_throughput", "description": "VLLM Engine throughput - synthetic\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 128,\n \"num-prompts\": 1000\n}", "gpu_description": "NVIDIA L4 x 1", "vllm_version": "0.5.0", "python_version": "3.9.17 (main, Jun 7 2023, 12:34:12) \n[GCC 11.3.0]", "torch_version": "2.3.0+cu121"}`	`960.5438754334413` tokens/s	`941.0230484553284` tokens/s	`0.98`