
Commit 2df326f: updated test model logprobs
robertgshaw2-neuralmagic committed Jun 11, 2024 (1 parent: 1444822)

Showing 1 changed file: tests/models/test_models_logprobs.py (7 additions, 11 deletions)
@@ -13,7 +13,7 @@
"meta-llama/Llama-2-7b-hf",
"mistralai/Mistral-7B-v0.1",
"Deci/DeciLM-7b",
"tiiuae/falcon-7b",

"gpt2",
"bigcode/tiny_starcoder_py",
"EleutherAI/gpt-j-6b",
@@ -33,8 +33,9 @@
"bigcode/starcoder2-3b",
]

SKIPPED_MODELS_OOM = [
"EleutherAI/gpt-j-6b",
SKIPPED_MODELS_CI = [
"EleutherAI/gpt-j-6b", # OOM on CPU RAM
"tiiuae/falcon-7b", # Fails in vllm if trust_remote_code=True
]


@@ -54,23 +55,18 @@ def test_models(
     if model in SKIPPED_MODELS_ACC:
         pytest.skip(reason="Low priority models not currently passing. "
                     "We need to re-enable these.")
-    if model in SKIPPED_MODELS_OOM:
-        pytest.skip(reason="These models cause OOM issue on the CPU"
-                    "because it is a fp32 checkpoint.")
+    if model in SKIPPED_MODELS_CI:
+        pytest.skip(reason="These models cause some CI issue unrelated "
+                    "to the correctness of the implementation.")

     hf_model = hf_runner_nm(model, dtype=dtype)
     hf_outputs = hf_model.generate_greedy_logprobs_nm(example_prompts,
                                                       max_tokens, num_logprobs)

     del hf_model

-    trust_remote_code = True
+    # Falcon fails if trust_remote_code = True
+    # https://github.com/vllm-project/vllm/issues/5363
+    trust_remote_code = model != "tiiuae/falcon-7b"
     vllm_model = vllm_runner_nm(model,
                                 dtype=dtype,
                                 trust_remote_code=trust_remote_code,
                                 max_model_len=MODEL_MAX_LEN)
     vllm_outputs = vllm_model.generate_greedy_logprobs(example_prompts,
                                                        max_tokens,
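Taken together, the two changes establish a simple pattern: one skip list for models that break in CI for environmental reasons, plus a per-model trust_remote_code gate. Below is a minimal, self-contained sketch of that pattern under stated assumptions: the model list is truncated, the test body is reduced to checking the flag logic, and the repo's hf_runner_nm/vllm_runner_nm fixtures are deliberately not used.

import pytest

MODELS = [
    "meta-llama/Llama-2-7b-hf",
    "tiiuae/falcon-7b",
    "EleutherAI/gpt-j-6b",
]

SKIPPED_MODELS_CI = [
    "EleutherAI/gpt-j-6b",  # OOM on CPU RAM
    "tiiuae/falcon-7b",  # Fails in vllm if trust_remote_code=True
]


@pytest.mark.parametrize("model", MODELS)
def test_models_sketch(model: str) -> None:
    # Models with known CI-environment problems are skipped before any
    # weights are loaded, so the skip reason is explicit in the report.
    if model in SKIPPED_MODELS_CI:
        pytest.skip(reason="These models cause some CI issue unrelated "
                    "to the correctness of the implementation.")

    # Falcon fails in vllm if trust_remote_code=True
    # (https://github.com/vllm-project/vllm/issues/5363), so the flag is
    # disabled for falcon and enabled for every other model.
    trust_remote_code = model != "tiiuae/falcon-7b"
    assert trust_remote_code  # falcon is skipped above; all others pass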

4 comments on commit 2df326f

@github-actions

bigger_is_better

Benchmark: VLLM Engine throughput - synthetic
Config: model NousResearch/Llama-2-7b-chat-hf, max_model_len 4096, benchmark_throughput with use-all-available-gpus, input-len 256, output-len 128, num-prompts 1000
Environment: NVIDIA L4 x 1, vllm 0.5.0, Python 3.10.12, torch 2.3.0+cu121

Metric               Current (2df326f)             Previous (ba91187)            Ratio
request_throughput   2.430058339880848 prompts/s   2.453237652823585 prompts/s   1.01
token_throughput     933.1424025142456 tokens/s    942.0432586842567 tokens/s    1.01

This comment was automatically generated by workflow using github-action-benchmark.
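As a sanity check on the Ratio column: recomputing from the two value columns shows the reported 1.01 equals previous / current for both metrics, i.e. this run of commit 2df326f was about 1% slower than ba91187 on these bigger-is-better throughput metrics. A quick sketch with the values copied from the table above (the previous/current convention is inferred from the numbers, not from github-action-benchmark's documentation):

# Recompute the Ratio column from the table above. For a bigger-is-better
# metric, previous / current > 1 means the current commit is slower.
current = {"request_throughput": 2.430058339880848,   # prompts/s
           "token_throughput": 933.1424025142456}     # tokens/s
previous = {"request_throughput": 2.453237652823585,
            "token_throughput": 942.0432586842567}

for metric, value in current.items():
    print(f"{metric}: {previous[metric] / value:.2f}")  # prints 1.01 for both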

@github-actions

bigger_is_better

Benchmark: VLLM Engine throughput - synthetic
Config: model NousResearch/Llama-2-7b-chat-hf, max_model_len 4096, benchmark_throughput with use-all-available-gpus, input-len 256, output-len 128, num-prompts 1000
Environment: NVIDIA L4 x 1, vllm 0.5.0, Python 3.8.17, torch 2.3.0+cu121

Metric               Current (2df326f)              Previous (ba91187)   Ratio
request_throughput   2.4288215879085495 prompts/s   -                    -
token_throughput     932.6674897568829 tokens/s     -                    -

This comment was automatically generated by workflow using github-action-benchmark.

@github-actions

bigger_is_better

Benchmark: VLLM Engine throughput - synthetic
Config: model NousResearch/Llama-2-7b-chat-hf, max_model_len 4096, benchmark_throughput with use-all-available-gpus, input-len 256, output-len 128, num-prompts 1000
Environment: NVIDIA L4 x 1, vllm 0.5.0, Python 3.9.17, torch 2.3.0+cu121

Metric               Current (2df326f)             Previous (ba91187)   Ratio
request_throughput   2.432285572106368 prompts/s   -                    -
token_throughput     933.9976596888453 tokens/s    -                    -

This comment was automatically generated by workflow using github-action-benchmark.

@github-actions

bigger_is_better

Benchmark: VLLM Engine throughput - synthetic
Config: model NousResearch/Llama-2-7b-chat-hf, max_model_len 4096, benchmark_throughput with use-all-available-gpus, input-len 256, output-len 128, num-prompts 1000
Environment: NVIDIA L4 x 1, vllm 0.5.0, Python 3.11.4, torch 2.3.0+cu121

Metric               Current (2df326f)             Previous (ba91187)   Ratio
request_throughput   2.462970695106252 prompts/s   -                    -
token_throughput     945.7807469208007 tokens/s    -                    -

This comment was automatically generated by workflow using github-action-benchmark.
