From ea450005d1b2248cdf164f09415b1280f0de6b2c Mon Sep 17 00:00:00 2001
From: Pablo Gonzalez
Date: Mon, 13 Jan 2025 17:31:20 -0500
Subject: [PATCH] Update README to have interactive server scenario + remove
 unnecessary changes

---
 language/llama2-70b/README.md | 6 +++---
 language/llama2-70b/main.py   | 1 -
 language/llama2-70b/user.conf | 7 -------
 3 files changed, 3 insertions(+), 11 deletions(-)

diff --git a/language/llama2-70b/README.md b/language/llama2-70b/README.md
index 760a5c01bf..5268863ac6 100644
--- a/language/llama2-70b/README.md
+++ b/language/llama2-70b/README.md
@@ -250,16 +250,16 @@ This was run on a DGX-H100 node. Total runtime was ~4.5 days.
 
 For official, Llama2-70b submissions it is also possible to submit in the interactive category. This sets a more strict latency requirements for Time to First Token (ttft) and Time per Output Token (tpot). Specifically, the interactive category requires loadgen to enforce `ttft <= 450ms` and `ttft <= 40ms`
 
-In order to run interactive category, it is sufficient to set the flag `--lg-model-name` as `llama2-70b-interactive` when calling the `main.py` to run the benchmark. For example, to run the offline scenario in interactive mode:
+In order to run interactive category, it is sufficient to set the flag `--lg-model-name` as `llama2-70b-interactive` when calling the `main.py` to run the benchmark. For example, to run the server scenario in interactive mode:
 ```
-python -u main.py --scenario Offline \
+python -u main.py --scenario Server \
         --model-path ${CHECKPOINT_PATH} \
         --mlperf-conf mlperf.conf \
         --user-conf user.conf \
         --total-sample-count 24576 \
         --device cpu \
         --dataset-path ${DATASET_PATH} \
-        --output-log-dir offline-logs \
+        --output-log-dir server-logs \
         --lg-model-name llama2-70b-interactive
 ```
 
diff --git a/language/llama2-70b/main.py b/language/llama2-70b/main.py
index c5c56c8aea..84ccf849ad 100644
--- a/language/llama2-70b/main.py
+++ b/language/llama2-70b/main.py
@@ -152,7 +152,6 @@ def main():
     settings = lg.TestSettings()
     settings.scenario = scenario_map[args.scenario.lower()]
     # mlperf.conf is automatically loaded by the loadgen
-    # settings.FromConfig(args.mlperf_conf, args.lg_model_name, args.scenario)
     settings.FromConfig(args.user_conf, args.lg_model_name, args.scenario)
 
     if args.accuracy:
diff --git a/language/llama2-70b/user.conf b/language/llama2-70b/user.conf
index 15fe568957..bb97c437a9 100644
--- a/language/llama2-70b/user.conf
+++ b/language/llama2-70b/user.conf
@@ -9,10 +9,3 @@
 *.Server.target_qps = 0.5
 *.Server.min_duration = 120000
 *.Server.min_query_count = 100
-
-llama2-70b.Server.sample_concatenate_permutation = 1
-
-# Target Latencies for low latency setting.
-# llama2-70b-interactive.Server.target_latency = 0
-# llama2-70b-interactive.Server.ttft_latency = 450
-# llama2-70b-interactive.Server.tpot_latency = 40
\ No newline at end of file
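For reference, the interactive latency targets described in the README text above correspond to the loadgen config keys that appear in the commented-out lines this patch removes from user.conf. A minimal sketch of what explicit overrides would look like is shown below; it assumes the `llama2-70b-interactive.Server.*` keys remain recognized by loadgen, and since the bundled mlperf.conf is loaded automatically, such overrides are normally unnecessary.

```
# Hypothetical user.conf overrides for the interactive server scenario.
# Values mirror the commented-out lines removed by this patch
# (450 ms time to first token, 40 ms time per output token); loadgen
# already enforces these via mlperf.conf, so setting them here is optional.
llama2-70b-interactive.Server.ttft_latency = 450
llama2-70b-interactive.Server.tpot_latency = 40
```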