This repository has been archived by the owner on Oct 11, 2024. It is now read-only.

Commit
Merge branch 'main' into remote-push-refactor
dbarbuzzi committed Jun 13, 2024
2 parents 9b2d02f + 5aaec10 commit 4b691b9
Showing 253 changed files with 10,345 additions and 3,021 deletions.
26 changes: 26 additions & 0 deletions .buildkite/nightly-benchmarks/kickoff-pipeline.sh
@@ -0,0 +1,26 @@
#!/usr/bin/env bash

set -euo pipefail

# Install system packages
apt update
apt install -y curl jq

# Install minijinja for templating
curl -sSfL https://github.com/mitsuhiko/minijinja/releases/latest/download/minijinja-cli-installer.sh | sh
source $HOME/.cargo/env

# If BUILDKITE_PULL_REQUEST != "false", then we check the PR labels using curl and jq
if [ "$BUILDKITE_PULL_REQUEST" != "false" ]; then
PR_LABELS=$(curl -s "https://api.github.com/repos/vllm-project/vllm/pulls/$BUILDKITE_PULL_REQUEST" | jq -r '.labels[].name')

if [[ $PR_LABELS == *"perf-benchmarks"* ]]; then
echo "This PR has the 'perf-benchmarks' label. Proceeding with the nightly benchmarks."
else
echo "This PR does not have the 'perf-benchmarks' label. Skipping the nightly benchmarks."
exit 0
fi
fi

# Upload sample.yaml
buildkite-agent pipeline upload .buildkite/nightly-benchmarks/sample.yaml
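The label gate above depends only on the labels[].name field of the GitHub pull-request payload. As a sanity check, the same jq filter can be exercised against a hand-written payload; the file path and label set below are illustrative, not part of this commit.

# Hypothetical payload mimicking the shape of the GitHub API response
cat > /tmp/pr.json <<'EOF'
{"labels": [{"name": "perf-benchmarks"}, {"name": "documentation"}]}
EOF

# Same filter used by kickoff-pipeline.sh; prints one label name per line
jq -r '.labels[].name' /tmp/pr.json
# perf-benchmarks
# documentation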
39 changes: 39 additions & 0 deletions .buildkite/nightly-benchmarks/sample.yaml
@@ -0,0 +1,39 @@
steps:
# NOTE(simon): You can create separate blocks for different jobs
- label: "A100: NVIDIA SMI"
agents:
queue: A100
plugins:
- kubernetes:
podSpec:
containers:
# - image: us-central1-docker.pkg.dev/vllm-405802/vllm-ci-test-repo/vllm-test:$BUILDKITE_COMMIT
# TODO(simon): check latest main branch or use the PR image.
- image: us-central1-docker.pkg.dev/vllm-405802/vllm-ci-test-repo/vllm-test:45c35f0d58f4508bf43bd6af1d3d0d0ec0c915e6
command:
- bash -c 'nvidia-smi && nvidia-smi topo -m && pwd && ls'
resources:
limits:
nvidia.com/gpu: 8
volumeMounts:
- name: devshm
mountPath: /dev/shm
nodeSelector:
nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
volumes:
- name: devshm
emptyDir:
medium: Memory
# TODO(simon): bring H100 online
# - label: "H100: NVIDIA SMI"
# agents:
# queue: H100
# plugins:
# - docker#v5.11.0:
# image: us-central1-docker.pkg.dev/vllm-405802/vllm-ci-test-repo/vllm-test:45c35f0d58f4508bf43bd6af1d3d0d0ec0c915e6
# command:
# - bash -c 'nvidia-smi && nvidia-smi topo -m'
# propagate-environment: true
# ipc: host
# gpus: all
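The devshm volume above backs /dev/shm with host memory, which NCCL and PyTorch data loaders use for shared-memory transport. A quick, non-authoritative check from inside a pod scheduled by this pipeline (standard utilities, not part of this commit):

# Confirm /dev/shm is memory-backed and sized for shared buffers
df -h /dev/shm
# Inspect GPU interconnect topology, mirroring the pipeline's own command
nvidia-smi topo -m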

8 changes: 4 additions & 4 deletions .buildkite/run-benchmarks.sh
@@ -50,16 +50,16 @@ echo "### Serving Benchmarks" >> benchmark_results.md
sed -n '1p' benchmark_serving.txt >> benchmark_results.md # first line
echo "" >> benchmark_results.md
echo '```' >> benchmark_results.md
tail -n 20 benchmark_serving.txt >> benchmark_results.md # last 20 lines
tail -n 24 benchmark_serving.txt >> benchmark_results.md # last 24 lines
echo '```' >> benchmark_results.md

# if the agent binary is not found, skip uploading the results, exit 0
if [ ! -f /workspace/buildkite-agent ]; then
if [ ! -f /usr/bin/buildkite-agent ]; then
exit 0
fi

# upload the results to buildkite
/workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < benchmark_results.md
buildkite-agent annotate --style "info" --context "benchmark-results" < benchmark_results.md

# exit with the exit code of the benchmarks
if [ $bench_latency_exit_code -ne 0 ]; then
@@ -75,4 +75,4 @@ if [ $bench_serving_exit_code -ne 0 ]; then
fi

rm ShareGPT_V3_unfiltered_cleaned_split.json
/workspace/buildkite-agent artifact upload "*.json"
buildkite-agent artifact upload "*.json"
14 changes: 12 additions & 2 deletions .buildkite/run-cpu-test.sh
@@ -10,5 +10,15 @@ remove_docker_container() { docker rm -f cpu-test || true; }
trap remove_docker_container EXIT
remove_docker_container

# Run the image and launch offline inference
docker run --network host --env VLLM_CPU_KVCACHE_SPACE=1 --name cpu-test cpu-test python3 vllm/examples/offline_inference.py
# Run the image
docker run -itd -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 --cpuset-mems=1 --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --name cpu-test cpu-test

# offline inference
docker exec cpu-test bash -c "python3 examples/offline_inference.py"

# Run basic model test
docker exec cpu-test bash -c "cd tests;
pip install pytest Pillow protobuf
bash ../.buildkite/download-images.sh
cd ../
pytest -v -s tests/models --ignore=tests/models/test_llava.py --ignore=tests/models/test_embedding.py --ignore=tests/models/test_registry.py"
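The --cpuset-cpus=48-95 --cpuset-mems=1 flags above pin the container to the cores and memory of a single NUMA node so the CPU test is not skewed by cross-socket traffic. A hedged way to confirm that this core range actually belongs to NUMA node 1 on a given host (standard Linux tools, not added by this commit):

# CPU ranges per NUMA node; node 1 is expected to cover 48-95 on the CI host
lscpu | grep -i 'numa node'
# Alternative view that also shows per-node memory sizes
numactl --hardware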
31 changes: 17 additions & 14 deletions .buildkite/test-pipeline.yaml
@@ -37,6 +37,7 @@ steps:
working_dir: "/vllm-workspace/tests"
num_gpus: 2
commands:
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py
@@ -45,7 +46,8 @@ steps:
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
- pytest -v -s spec_decode/e2e/test_integration_dist.py
- pytest -v -s spec_decode/e2e/test_integration_dist.py
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py

- label: Distributed Tests (Multiple Groups)
#mirror_hardwares: [amd]
@@ -62,7 +64,6 @@ steps:
mirror_hardwares: [amd]

commands:
- pytest -v -s test_inputs.py
- pytest -v -s entrypoints -m llm
- pytest -v -s entrypoints -m openai

@@ -79,6 +80,13 @@ steps:
- python3 llava_example.py
- python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors

- label: Inputs Test
#mirror_hardwares: [amd]
commands:
- bash ../.buildkite/download-images.sh
- pytest -v -s test_inputs.py
- pytest -v -s multimodal

- label: Kernels Test %N
#mirror_hardwares: [amd]
command: pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
@@ -87,14 +95,13 @@ steps:
- label: Models Test
#mirror_hardwares: [amd]
commands:
- bash ../.buildkite/download-images.sh
- pytest -v -s models --ignore=models/test_llava.py
- pytest -v -s models -m \"not llava\"

- label: Llava Test
mirror_hardwares: [amd]
commands:
- bash ../.buildkite/download-images.sh
- pytest -v -s models/test_llava.py
- pytest -v -s models -m llava

- label: Prefix Caching Test
mirror_hardwares: [amd]
@@ -118,7 +125,10 @@ steps:

- label: Speculative decoding tests
#mirror_hardwares: [amd]
command: pytest -v -s spec_decode
commands:
# See https://github.com/vllm-project/vllm/issues/5152
- export VLLM_ATTENTION_BACKEND=XFORMERS
- pytest -v -s spec_decode

- label: LoRA Test %N
#mirror_hardwares: [amd]
@@ -130,14 +140,7 @@ steps:
num_gpus: 4
# This test runs llama 13B, so it is required to run on 4 GPUs.
commands:
# Temporarily run this way because we cannot clean up GPU mem usage
# for multi GPU tests.
# TODO(sang): Fix it.
- pytest -v -s lora/test_long_context.py::test_rotary_emb_replaced
- pytest -v -s lora/test_long_context.py::test_batched_rope_kernel
- pytest -v -s lora/test_long_context.py::test_self_consistency
- pytest -v -s lora/test_long_context.py::test_quality
- pytest -v -s lora/test_long_context.py::test_max_len
- pytest -v -s -x lora/test_long_context.py

- label: Tensorizer Test
#mirror_hardwares: [amd]
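The Models and Llava steps above switch from path-based --ignore filters to pytest markers (-m llava, -m "not llava"), which assumes a llava marker is registered in the repository's pytest configuration. A quick way to preview which tests a marker expression would select, run from the tests working directory (collection only, nothing is executed; not part of this commit):

# Dry-run collection for each marker expression
pytest -q --collect-only -m llava models
pytest -q --collect-only -m "not llava" models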
64 changes: 64 additions & 0 deletions .buildkite/test-template-aws.j2
@@ -0,0 +1,64 @@
{% set docker_image = "public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT" %}
{% set default_working_dir = "/vllm-workspace/tests" %}

steps:
- label: ":docker: build image"
agents:
queue: cpu_queue
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "docker build --build-arg max_jobs=16 --tag {{ docker_image }} --target test --progress plain ."
- "docker push {{ docker_image }}"
env:
DOCKER_BUILDKIT: "1"
retry:
automatic:
- exit_status: -1 # Agent was lost
limit: 5
- exit_status: -10 # Agent was lost
limit: 5
- wait

{% for step in steps %}
- label: "{{ step.label }}"
agents:
{% if step.label == "Documentation Build" %}
queue: small_cpu_queue
{% elif step.no_gpu %}
queue: cpu_queue
{% elif step.num_gpus == 2 or step.num_gpus == 4 %}
queue: gpu_4_queue
{% else %}
queue: gpu_1_queue
{% endif %}
soft_fail: true
{% if step.parallelism %}
parallelism: {{ step.parallelism }}
{% endif %}
retry:
automatic:
- exit_status: -1 # Agent was lost
limit: 5
- exit_status: -10 # Agent was lost
limit: 5
plugins:
- docker#v5.2.0:
image: {{ docker_image }}
always-pull: true
propagate-environment: true
{% if not step.no_gpu %}
gpus: all
{% endif %}
{% if step.label == "Benchmarks" %}
mount-buildkite-agent: true
{% endif %}
command: ["bash", "-c", "cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(' && ')) | safe }}"]
environment:
- VLLM_USAGE_SOURCE=ci-test
- HF_TOKEN
{% if step.label == "Speculative decoding tests" %}
- VLLM_ATTENTION_BACKEND=XFORMERS
{% endif %}
volumes:
- /dev/shm:/dev/shm
{% endfor %}
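The kickoff script earlier in this commit installs minijinja-cli, so a template like this can be rendered and inspected locally before being uploaded to Buildkite. A rough sketch, assuming the step definitions live in a YAML file; the /tmp/steps.yaml name and its contents are illustrative, not part of this commit.

# Hypothetical step list; the real pipeline supplies its own step data
cat > /tmp/steps.yaml <<'EOF'
steps:
  - label: "Models Test"
    commands:
      - pytest -v -s models -m "not llava"
EOF

# Render the template, inspect the generated pipeline, then upload it
minijinja-cli .buildkite/test-template-aws.j2 /tmp/steps.yaml > pipeline.yaml
buildkite-agent pipeline upload pipeline.yaml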
7 changes: 5 additions & 2 deletions .buildkite/test-template.j2
@@ -4,7 +4,7 @@

steps:
- label: ":docker: build image"
commands:
commands:
- "docker build --build-arg max_jobs=16 --tag {{ docker_image }} --target test --progress plain ."
- "docker push {{ docker_image }}"
env:
@@ -28,6 +28,7 @@ steps:
command: bash .buildkite/run-amd-test.sh "cd {{ (step.working_dir or default_working_dir) | safe }} ; {{ step.command or (step.commands | join(" ; ")) | safe }}"
env:
DOCKER_BUILDKIT: "1"
soft_fail: true
{% endif %}
{% endfor %}

@@ -36,10 +37,12 @@ steps:
agents:
queue: neuron
command: bash .buildkite/run-neuron-test.sh
soft_fail: true
soft_fail: false

- label: "Intel Test"
depends_on: ~
agents:
queue: intel
command: bash .buildkite/run-cpu-test.sh

{% for step in steps %}
2 changes: 2 additions & 0 deletions .github/scripts/run-tests
@@ -113,6 +113,8 @@ do
# need to be run with specific options
if [[ "${TEST}" == *"kernels"* || "${TEST}" == *"samplers"* ]]; then
CUDA_VISIBLE_DEVICES=0,1 pytest ${CC_PYTEST_FLAGS} --junitxml=${RESULT_XML} ${TEST} || LOCAL_SUCCESS=$?
elif [[ "${TEST}" == *"distributed/test_same_node"* ]]; then
VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 ${TEST} || LOCAL_SUCCESS=$?
elif [[ "${TEST}" == *"distributed"* ]]; then
CUDA_VISIBLE_DEVICES=0,1 pytest ${CC_PYTEST_FLAGS} --junitxml=${RESULT_XML} ${TEST} || LOCAL_SUCCESS=$?
elif [[ "${TEST}" == *"test_models_logprobs"* ]]; then
1 change: 1 addition & 0 deletions .github/workflows/mypy.yaml
@@ -37,6 +37,7 @@ jobs:
mypy vllm/distributed --config-file pyproject.toml
mypy vllm/entrypoints --config-file pyproject.toml
mypy vllm/executor --config-file pyproject.toml
mypy vllm/multimodal --config-file pyproject.toml
mypy vllm/usage --config-file pyproject.toml
mypy vllm/*.py --config-file pyproject.toml
mypy vllm/transformers_utils --config-file pyproject.toml
2 changes: 1 addition & 1 deletion .github/workflows/nm-build-test.yml
@@ -27,7 +27,7 @@ on:
nvcc_threads:
description: "number of threads nvcc build threads"
type: string
default: "4"
default: "8"
# test related parameters
test_label_solo:
description: "requested runner label (specifies instance)"
2 changes: 1 addition & 1 deletion .github/workflows/nm-upload-assets-to-gcp.yml
@@ -58,7 +58,7 @@ jobs:
with:
project_id: ${{ secrets.GCP_PROJECT }}
workload_identity_provider: ${{ secrets.GCP_WORKLOAD_IDENTITY_PROVIDER }}
service_account: ${{ secrets.GCP_SA }}
service_account: ${{ secrets.NM_PYPI_SA }}

- name: 'Set up Cloud SDK'
uses: 'google-github-actions/setup-gcloud@v2'