Include interactive mode in llama2 reference + README
pgmpablo157321 committed Jan 8, 2025
1 parent 7fa675c commit 4d893e4
Showing 2 changed files with 27 additions and 3 deletions.
18 changes: 18 additions & 0 deletions language/llama2-70b/README.md
@@ -245,3 +245,21 @@ scale from a 0.0-1.0 scale):
- Tokens per sample: 294.45

This was run on a DGX-H100 node. Total runtime was ~4.5 days.

# Run llama2-70b-interactive benchmark

For official Llama2-70b submissions, it is also possible to submit in the interactive category. This sets stricter latency requirements for Time to First Token (TTFT) and Time per Output Token (TPOT). Specifically, the interactive category requires loadgen to enforce `ttft <= 450ms` and `tpot <= 40ms`.
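These limits correspond to per-model latency constraints in the loadgen configuration. As a hedged sketch, the matching `user.conf` overrides could look like the following (key names follow the mlperf.conf naming convention and the values are illustrative; check the shipped mlperf.conf for the authoritative entries):

```
# Hypothetical overrides for the interactive category (values in milliseconds)
llama2-70b-interactive.Server.ttft_latency = 450
llama2-70b-interactive.Server.tpot_latency = 40
```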

In order to run the interactive category, it is sufficient to set the flag `--lg-model-name` to `llama2-70b-interactive` when calling `main.py` to run the benchmark. For example, to run the Offline scenario in interactive mode:

```
python -u main.py --scenario Offline \
--model-path ${CHECKPOINT_PATH} \
--mlperf-conf mlperf.conf \
--user-conf user.conf \
--total-sample-count 24576 \
--device cpu \
--dataset-path ${DATASET_PATH} \
--output-log-dir offline-logs \
--lg-model-name llama2-70b-interactive
```
12 changes: 9 additions & 3 deletions language/llama2-70b/main.py
@@ -120,7 +120,13 @@ def get_args():
default=None,
help="Specify an api endpoint call to use api mode",
)

parser.add_argument(
"--lg-model-name",
type=str,
default="llama2-70b",
choices=["llama2-70b", "llama2-70b-interactive"],
    help="Model name (specified in LLM server)",
)
args = parser.parse_args()
return args
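The new flag relies on argparse `choices` to restrict the accepted model names. A minimal standalone sketch (a hypothetical parser, not the benchmark's full `get_args`) showing that behavior:

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--lg-model-name",
    type=str,
    default="llama2-70b",
    choices=["llama2-70b", "llama2-70b-interactive"],
    help="Model name (specified in LLM server)",
)

# With no flag given, the default non-interactive name is used
args = parser.parse_args([])
print(args.lg_model_name)  # llama2-70b

# Interactive mode is selected explicitly
args = parser.parse_args(["--lg-model-name", "llama2-70b-interactive"])
print(args.lg_model_name)  # llama2-70b-interactive
```

Any other value (e.g. a typo) makes argparse exit with an "invalid choice" error, so a misconfigured model name fails fast instead of silently loading the wrong loadgen settings.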

@@ -146,8 +152,8 @@ def main():
settings = lg.TestSettings()
settings.scenario = scenario_map[args.scenario.lower()]
# mlperf.conf is automatically loaded by the loadgen
# settings.FromConfig(args.mlperf_conf, "llama2-70b", args.scenario)
settings.FromConfig(args.user_conf, "llama2-70b", args.scenario)
# settings.FromConfig(args.mlperf_conf, args.lg_model_name, args.scenario)
settings.FromConfig(args.user_conf, args.lg_model_name, args.scenario)

if args.accuracy:
settings.mode = lg.TestMode.AccuracyOnly
