Skip to content
This repository has been archived by the owner on Oct 11, 2024. It is now read-only.

Commit

Permalink
formatted
Browse files Browse the repository at this point in the history
  • Loading branch information
robertgshaw2-neuralmagic committed Jul 2, 2024
1 parent 655389d commit 206af82
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 6 deletions.
3 changes: 2 additions & 1 deletion tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,7 +205,8 @@ def __init__(
if is_vision_model:
auto_cls = AutoModelForVision2Seq
elif is_compressed_tensors_model:
from llmcompressor.transformers import SparseAutoModelForCausalLM
from llmcompressor.transformers import ( # noqa: E501
SparseAutoModelForCausalLM)
auto_cls = SparseAutoModelForCausalLM
else:
auto_cls = AutoModelForCausalLM
Expand Down
11 changes: 6 additions & 5 deletions tests/models/test_compressed_tensors.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,11 @@ def test_models(
model_name,
) -> None:
# Run sparseml.
with hf_runner(model_name=model_name,
is_compressed_tensors_model=True) as compressed_tensors_models:
with hf_runner(
model_name=model_name,
is_compressed_tensors_model=True) as compressed_tensors_models:

sparseml_outputs = compressed_tensors_models.generate_greedy_logprobs_limit(
ct_outputs = compressed_tensors_models.generate_greedy_logprobs_limit(
example_prompts, MAX_TOKENS, NUM_LOGPROBS)

# Run vllm.
Expand All @@ -47,8 +48,8 @@ def test_models(
example_prompts, MAX_TOKENS, NUM_LOGPROBS)

check_logprobs_close(
outputs_0_lst=sparseml_outputs,
outputs_0_lst=ct_outputs,
outputs_1_lst=vllm_outputs,
name_0="sparseml",
name_0="compressed-tensors",
name_1="vllm",
)

2 comments on commit 206af82

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

smaller_is_better

Benchmark suite Current: 206af82 Previous: 569c905 Ratio
{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - meta-llama/Meta-Llama-3-8B-Instruct\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA L4 x 1", "vllm_version": "0.5.1", "python_version": "3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]", "torch_version": "2.3.0+cu121"} 194.7061315133821 ms 186.83252736327026 ms 1.04
{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - meta-llama/Meta-Llama-3-8B-Instruct\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA L4 x 1", "vllm_version": "0.5.1", "python_version": "3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]", "torch_version": "2.3.0+cu121"} 85.79491367311213 ms 85.4025971831417 ms 1.00
{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - facebook/opt-350m\nmax-model-len - 2048\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA L4 x 1", "vllm_version": "0.5.1", "python_version": "3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]", "torch_version": "2.3.0+cu121"} 24.718283303385153 ms 25.236528139927636 ms 0.98
{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - facebook/opt-350m\nmax-model-len - 2048\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA L4 x 1", "vllm_version": "0.5.1", "python_version": "3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]", "torch_version": "2.3.0+cu121"} 6.261794164871115 ms 6.1105002335698435 ms 1.02

This comment was automatically generated by workflow using github-action-benchmark.

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

smaller_is_better

Benchmark suite Current: 206af82 Previous: 569c905 Ratio
{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - meta-llama/Meta-Llama-3-8B-Instruct\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA L4 x 1", "vllm_version": "0.5.1", "python_version": "3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]", "torch_version": "2.3.0+cu121"} 186.70196533334015 ms 186.83252736327026 ms 1.00
{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - meta-llama/Meta-Llama-3-8B-Instruct\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA L4 x 1", "vllm_version": "0.5.1", "python_version": "3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]", "torch_version": "2.3.0+cu121"} 83.50901264231538 ms 85.4025971831417 ms 0.98
{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - facebook/opt-350m\nmax-model-len - 2048\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA L4 x 1", "vllm_version": "0.5.1", "python_version": "3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]", "torch_version": "2.3.0+cu121"} 23.58798709999898 ms 25.236528139927636 ms 0.93
{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - facebook/opt-350m\nmax-model-len - 2048\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA L4 x 1", "vllm_version": "0.5.1", "python_version": "3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]", "torch_version": "2.3.0+cu121"} 5.972017394233661 ms 6.1105002335698435 ms 0.98

This comment was automatically generated by workflow using github-action-benchmark.

Please sign in to comment.