formatted

neuralmagic · Jul 2, 2024 · 206af82 · github-actions · Jul 2, 2024
1 parent 655389d
commit 206af82
Show file tree

Hide file tree

Showing 2 changed files with 8 additions and 6 deletions.
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -205,7 +205,8 @@ def __init__(
             if is_vision_model:
                 auto_cls = AutoModelForVision2Seq
             elif is_compressed_tensors_model:
-                from llmcompressor.transformers import SparseAutoModelForCausalLM
+                from llmcompressor.transformers import (  # noqa: E501
+                    SparseAutoModelForCausalLM)
                 auto_cls = SparseAutoModelForCausalLM
             else:
                 auto_cls = AutoModelForCausalLM

diff --git a/tests/models/test_compressed_tensors.py b/tests/models/test_compressed_tensors.py
@@ -35,10 +35,11 @@ def test_models(
     model_name,
 ) -> None:
     # Run sparseml.
-    with hf_runner(model_name=model_name,
-                   is_compressed_tensors_model=True) as compressed_tensors_models:
+    with hf_runner(
+            model_name=model_name,
+            is_compressed_tensors_model=True) as compressed_tensors_models:
 
-        sparseml_outputs = compressed_tensors_models.generate_greedy_logprobs_limit(
+        ct_outputs = compressed_tensors_models.generate_greedy_logprobs_limit(
             example_prompts, MAX_TOKENS, NUM_LOGPROBS)
 
     # Run vllm.
@@ -47,8 +48,8 @@ def test_models(
             example_prompts, MAX_TOKENS, NUM_LOGPROBS)
 
     check_logprobs_close(
-        outputs_0_lst=sparseml_outputs,
+        outputs_0_lst=ct_outputs,
         outputs_1_lst=vllm_outputs,
-        name_0="sparseml",
+        name_0="compressed-tensors",
         name_1="vllm",
     )
Benchmark suite	Current: `206af82`	Previous: `569c905`	Ratio
`{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - meta-llama/Meta-Llama-3-8B-Instruct\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA L4 x 1", "vllm_version": "0.5.1", "python_version": "3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]", "torch_version": "2.3.0+cu121"}`	`194.7061315133821` ms	`186.83252736327026` ms	`1.04`
`{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - meta-llama/Meta-Llama-3-8B-Instruct\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA L4 x 1", "vllm_version": "0.5.1", "python_version": "3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]", "torch_version": "2.3.0+cu121"}`	`85.79491367311213` ms	`85.4025971831417` ms	`1.00`
`{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - facebook/opt-350m\nmax-model-len - 2048\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA L4 x 1", "vllm_version": "0.5.1", "python_version": "3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]", "torch_version": "2.3.0+cu121"}`	`24.718283303385153` ms	`25.236528139927636` ms	`0.98`
`{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - facebook/opt-350m\nmax-model-len - 2048\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA L4 x 1", "vllm_version": "0.5.1", "python_version": "3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]", "torch_version": "2.3.0+cu121"}`	`6.261794164871115` ms	`6.1105002335698435` ms	`1.02`
Benchmark suite	Current: `206af82`	Previous: `569c905`	Ratio
`{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - meta-llama/Meta-Llama-3-8B-Instruct\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA L4 x 1", "vllm_version": "0.5.1", "python_version": "3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]", "torch_version": "2.3.0+cu121"}`	`186.70196533334015` ms	`186.83252736327026` ms	`1.00`
`{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - meta-llama/Meta-Llama-3-8B-Instruct\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA L4 x 1", "vllm_version": "0.5.1", "python_version": "3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]", "torch_version": "2.3.0+cu121"}`	`83.50901264231538` ms	`85.4025971831417` ms	`0.98`
`{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - facebook/opt-350m\nmax-model-len - 2048\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA L4 x 1", "vllm_version": "0.5.1", "python_version": "3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]", "torch_version": "2.3.0+cu121"}`	`23.58798709999898` ms	`25.236528139927636` ms	`0.93`
`{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - facebook/opt-350m\nmax-model-len - 2048\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA L4 x 1", "vllm_version": "0.5.1", "python_version": "3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]", "torch_version": "2.3.0+cu121"}`	`5.972017394233661` ms	`6.1105002335698435` ms	`0.98`