This repository has been archived by the owner on Oct 11, 2024. It is now read-only.
forked from vllm-project/vllm
Commit
Varun Sundar Rabindranath committed on Mar 27, 2024
1 parent b548311 · commit a41c281
Showing 2 changed files with 4 additions and 5 deletions.
a41c281
bigger_is_better
{"name": "request_throughput", "description": "Benchmark vllm serving\nmodel - mistralai/Mistral-7B-Instruct-v0.2\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"5,inf\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 4", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1.077347861792705
prompts/s{"name": "input_throughput", "description": "Benchmark vllm serving\nmodel - mistralai/Mistral-7B-Instruct-v0.2\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"5,inf\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 4", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
200.60217186580167
tokens/s{"name": "output_throughput", "description": "Benchmark vllm serving\nmodel - mistralai/Mistral-7B-Instruct-v0.2\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"5,inf\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 4", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
190.47510196495023
tokens/s{"name": "request_throughput", "description": "Benchmark vllm engine throughput - with dataset\nmodel - mistralai/Mistral-7B-Instruct-v0.2\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"output-len\": 128,\n \"num-prompts\": 100,\n \"dataset\": \"sharegpt\",\n \"max-model-len\": 4096\n}", "gpu_description": "NVIDIA A10G x 4", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
7.775327625003007
prompts/s{"name": "token_throughput", "description": "Benchmark vllm engine throughput - with dataset\nmodel - mistralai/Mistral-7B-Instruct-v0.2\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"output-len\": 128,\n \"num-prompts\": 100,\n \"dataset\": \"sharegpt\",\n \"max-model-len\": 4096\n}", "gpu_description": "NVIDIA A10G x 4", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3525.5668050051136
tokens/sThis comment was automatically generated by workflow using github-action-benchmark.
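For context on how the rows above are defined: each throughput metric is a count divided by wall-clock duration. A minimal sketch with made-up numbers (standard definitions, not this repository's benchmark code):

```python
# Illustrative only: the function and its sample inputs are assumptions,
# not taken from the repository's benchmark scripts.
def throughputs(num_prompts: int, input_tokens: int, output_tokens: int,
                duration_s: float) -> dict:
    return {
        "request_throughput": num_prompts / duration_s,   # prompts/s
        "input_throughput": input_tokens / duration_s,    # prompt tokens/s
        "output_throughput": output_tokens / duration_s,  # generated tokens/s
    }

# Example: 100 prompts, 18,600 prompt tokens, 17,700 generated tokens in 92.8 s.
print(throughputs(100, 18600, 17700, 92.8))
```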
a41c281
smaller_is_better
{"name": "median_request_latency", "description": "Benchmark vllm serving\nmodel - mistralai/Mistral-7B-Instruct-v0.2\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"5,inf\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 4", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3263.224603000026
ms{"name": "mean_ttft_ms", "description": "Benchmark vllm serving\nmodel - mistralai/Mistral-7B-Instruct-v0.2\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"5,inf\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 4", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
240.91842699999688
ms{"name": "median_ttft_ms", "description": "Benchmark vllm serving\nmodel - mistralai/Mistral-7B-Instruct-v0.2\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"5,inf\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 4", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
240.87523399998645
ms{"name": "mean_tpot_ms", "description": "Benchmark vllm serving\nmodel - mistralai/Mistral-7B-Instruct-v0.2\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"5,inf\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 4", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
13.814070076108585
ms{"name": "median_tpot_ms", "description": "Benchmark vllm serving\nmodel - mistralai/Mistral-7B-Instruct-v0.2\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"5,inf\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 4", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
14.301734848100761
msThis comment was automatically generated by workflow using github-action-benchmark.
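The latency metrics above follow the usual serving definitions: TTFT (time to first token) is the delay from sending a request to receiving its first generated token, and TPOT (time per output token) is the average gap between subsequent tokens. A minimal sketch with made-up timestamps (illustrative, not the repository's benchmark_serving code):

```python
import statistics

# Per-request (start, first_token, end, output_tokens); sample data is made up.
requests = [
    (0.00, 0.24, 1.60, 100),
    (0.10, 0.35, 1.95, 110),
]

def ttft_ms(start: float, first_token: float) -> float:
    # Time To First Token: request sent -> first generated token received.
    return (first_token - start) * 1000.0

def tpot_ms(first_token: float, end: float, n_out: int) -> float:
    # Time Per Output Token: mean inter-token gap after the first token.
    return (end - first_token) * 1000.0 / max(n_out - 1, 1)

ttfts = [ttft_ms(s, f) for s, f, e, n in requests]
tpots = [tpot_ms(f, e, n) for s, f, e, n in requests]
print(f"mean_ttft_ms   {statistics.mean(ttfts):.2f}")    # 245.00
print(f"median_ttft_ms {statistics.median(ttfts):.2f}")  # 245.00
print(f"mean_tpot_ms   {statistics.mean(tpots):.2f}")
print(f"median_tpot_ms {statistics.median(tpots):.2f}")
```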
a41c281
bigger_is_better
{"name": "request_throughput", "description": "VLLM Engine throughput - synthetic\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 128,\n \"num-prompts\": 1000\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3.984863119976597
prompts/s3.98509085065037
prompts/s1.00
{"name": "token_throughput", "description": "VLLM Engine throughput - synthetic\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 128,\n \"num-prompts\": 1000\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1530.187438071013
tokens/s1530.2748866497423
tokens/s1.00
This comment was automatically generated by workflow using github-action-benchmark.
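The Ratio column compares the two commits per metric. A minimal sketch of such a check, assuming ratio = previous / current for bigger-is-better metrics and an illustrative threshold (an assumption for illustration, not github-action-benchmark's actual source):

```python
# Illustrative regression check; the ratio convention and threshold here
# are assumptions, not the benchmark action's real implementation.
def regression_ratio(previous: float, current: float,
                     bigger_is_better: bool) -> float:
    # For bigger-is-better metrics, a ratio above 1.00 means the new
    # commit is slower; for smaller-is-better, invert the comparison.
    return previous / current if bigger_is_better else current / previous

def is_regression(previous: float, current: float,
                  bigger_is_better: bool = True,
                  threshold: float = 1.05) -> bool:
    # Flag only when the new commit is more than `threshold`x worse.
    return regression_ratio(previous, current, bigger_is_better) > threshold

prev, curr = 3.98509085065037, 3.984863119976597    # prompts/s from the table
print(f"{regression_ratio(prev, curr, True):.2f}")  # -> 1.00
print(is_regression(prev, curr))                    # -> False, no regression
```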