From ecb880167756cb4b36ad70766b8d3254bfb06d26 Mon Sep 17 00:00:00 2001
From: Arjun Suresh
Date: Tue, 22 Oct 2024 17:58:13 +0100
Subject: [PATCH 1/2] [Postmortem 4.1] Make mlperf.conf static in loadgen, enable automatic Pypi release (#1882)

* Extend the generate_final_reports script to output a json file of results
* Fix an error in merging perf and power results
* Export weight_data_types in the submission checker csv
* Fix positional argument
* Quote model precision info in submission checker csv
* Quote model precision info in submission checker csv
* Fix submission generation for v4.0
* Autoload mlperf.conf in loadgen

  Added VERSION file

* Fix loadgen version in R50 test
* Fix format for setup.py
* Added soft link to mlperf.conf in root
* Fix format with clang
* Fixed softlink for mlperf.conf
* Fixed Python API binding for default argument in FromConfig
* Fixed Python API binding for default argument in FromConfig
* Fix a bug in FromConfig
* Fix deprecation of pkg_resources
* Fix deprecation of pkg_resources
* Fix deprecation of pkg_resources
* Fix package info
* Fix package info
* Added __init__.py
* Fix package info
* Restrict the use of only one conf file in loadgen
* Remove mlperf.conf arg from the reference implementations
* Use custom loadgen version for the github actions (retinanet+bert)
* Added version patch
* Remove mlperf.conf for vision benchmarks
* Fix compilation warnings
* formatted by clang
* Modify the build_wheels action to use VERSION
* Fix build_wheels gh action
* Do version increment in github action only for linux
* VERSION -> VERSION.txt
* Fix the build_wheel github action on windows
* VERSION -> VERSION.txt
* Support python 3.13 loadgen build
* VERSION -> VERSION.txt
* VERSION update
* Build wheels on macos and windows but upload from only ubuntu
* Build wheels on macos and windows but upload from only ubuntu
* Build wheels on macos and windows but upload from only ubuntu
* Fixes for github action
* Fixes for github action
* Fixes for github action
* Fixes for github action
* Make version number consistent for all OS
* Make version number consistent for all OS
* Do not release pypi wheels from loadgen-dev branch
* Split the keys specific to mlperf.conf
* Split the keys specific to mlperf.conf
* Split the keys specific to mlperf.conf
---
 .github/workflows/build_wheels.yml            |  95 +++++-
 .github/workflows/test-bert.yml               |   2 +-
 .github/workflows/test-loadgen.yml            |   4 +-
 .github/workflows/test-resnet50.yml           |   3 +-
 .github/workflows/test-retinanet.yml          |   2 +-
 language/bert/run.py                          |  52 ++-
 language/gpt-j/main.py                        |  72 ++--
 language/llama2-70b/main.py                   | 124 +++++--
 language/mixtral-8x7b/main.py                 |   9 +-
 loadgen/CMakeLists.txt                        |  20 +-
 loadgen/MANIFEST.in                           |   2 +
 loadgen/VERSION.txt                           |   1 +
 loadgen/__init__.py                           |   0
 loadgen/benchmark/repro.cpp                   |  16 +-
 loadgen/bindings/python_api.cc                |  90 ++---
 loadgen/mlperf.conf                           |  98 ++++++
 loadgen/pyproject.toml                        |   2 +-
 loadgen/results.h                             |  33 +-
 loadgen/setup.py                              |  42 ++-
 loadgen/test_settings.h                       |  11 +-
 loadgen/test_settings_internal.cc             | 166 +++++----
 mlperf.conf                                   |  99 +----
 recommendation/dlrm_v2/pytorch/python/main.py | 323 ++++++++++++++----
 text_to_image/main.py                         |  51 +--
 .../python/main.py                            | 191 ++++++++---
 .../classification_and_detection/run_local.sh |   2 +-
 26 files changed, 1040 insertions(+), 470 deletions(-)
 create mode 100644 loadgen/MANIFEST.in
 create mode 100644 loadgen/VERSION.txt
 create mode 100644 loadgen/__init__.py
 create mode 100644 loadgen/mlperf.conf
 mode change 100644 => 120000 mlperf.conf

diff --git
a/.github/workflows/build_wheels.yml b/.github/workflows/build_wheels.yml index 114783f19..d7f98764e 100644 --- a/.github/workflows/build_wheels.yml +++ b/.github/workflows/build_wheels.yml @@ -6,21 +6,69 @@ on: push: branches: - master + - loadgen-release paths: - - loadgen/setup.py + - loadgen/** jobs: + update_version: + name: Update version only on ubuntu but used by windows and macos + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + + # Step 3: Check if VERSION.txt file has changed in this push + - name: Check if VERSION.txt file has changed + id: version_changed + run: | + if git diff --name-only HEAD~1 | grep -q "VERSION.txt"; then + echo "VERSION.txt file has been modified" + echo "::set-output name=version_changed::true" + new_version=$(cat VERSION.txt) + else + echo "VERSION file has NOT been modified" + echo "::set-output name=version_changed::false" + fi + echo "::set-output name=new_version::$new_version" + + # Step 4: Increment version if VERSION was not changed + - name: Increment version if necessary + id: do_version_increment + if: steps.version_changed.outputs.version_changed == 'false' + run: | + cd loadgen + # Check if VERSION file exists, else initialize it + if [ ! -f VERSION.txt ]; then + echo "0.0.0" > VERSION.txt + fi + + version=$(cat VERSION.txt) + IFS='.' read -r major minor patch <<< "$version" + patch=$((patch + 1)) + new_version="$major.$minor.$patch" + echo $new_version > VERSION.txt + echo "New version: $new_version" + echo "::set-output name=new_version::$new_version" + + # Step 5: Commit the updated version to the repository + - name: Commit updated version + if: steps.version_changed.outputs.version_changed == 'false' + run: | + cd loadgen + git config --global user.name "${{ github.actor }}" + git config --global user.email "${{ github.actor }}@users.noreply.github.com" + git add VERSION.txt + git commit -m "Increment version to ${{ steps.do_version_increment.outputs.new_version }}" + git push + build_wheels: name: Build wheels on ${{ matrix.os }} + needs: update_version runs-on: ${{ matrix.os }} - environment: release - permissions: - # IMPORTANT: this permission is mandatory for trusted publishing - id-token: write strategy: fail-fast: false matrix: - os: [ubuntu-latest, windows-latest, macOS-latest] + os: [ubuntu-latest, windows-latest, macos-latest] steps: - uses: actions/checkout@v3 @@ -33,6 +81,41 @@ jobs: - name: Build wheels run: python -m cibuildwheel loadgen/ --output-dir wheels + # Save wheels as artifacts + - name: Upload built wheels + uses: actions/upload-artifact@v3 + with: + name: wheels-${{ matrix.os }} + path: wheels + + publish_wheels: + needs: build_wheels # Wait for the build_wheels job to complete + runs-on: ubuntu-latest # Only run this job on Linux + environment: release + permissions: + # IMPORTANT: this permission is mandatory for trusted publishing + id-token: write + steps: + - uses: actions/checkout@v3 + + # Download the built wheels from ubuntu + - name: Download Ubuntu wheels + uses: actions/download-artifact@v3 + with: + name: wheels-ubuntu-latest + path: wheels + # Download the built wheels from macOS + - name: Download macOS wheels + uses: actions/download-artifact@v3 + with: + name: wheels-macos-latest + path: wheels + # Download the built wheels from Windows + - name: Download Windows wheels + uses: actions/download-artifact@v3 + with: + name: wheels-windows-latest + path: wheels - name: Publish uses: pypa/gh-action-pypi-publish@release/v1 with: diff --git a/.github/workflows/test-bert.yml 
b/.github/workflows/test-bert.yml index 6f6d77a39..52ec1dd4d 100755 --- a/.github/workflows/test-bert.yml +++ b/.github/workflows/test-bert.yml @@ -33,4 +33,4 @@ jobs: python3 -m pip install cm4mlops - name: Test BERT and end to end submission generation run: | - cm run script --tags=run,mlperf,inference,generate-run-cmds,_submission,_short --quiet --submitter="MLCommons" --hw_name=default --model=bert-99 --implementation=reference --backend=${{ matrix.backend }} --device=cpu --scenario=Offline --adr.compiler.tags=gcc --adr.inference-src.version=custom --adr.inference-src.tags=_repo.${{ github.event.pull_request.head.repo.html_url }},_branch.${{ github.event.pull_request.head.ref }} + cm run script --tags=run,mlperf,inference,generate-run-cmds,_submission,_short --quiet --submitter="MLCommons" --hw_name=default --model=bert-99 --implementation=reference --backend=${{ matrix.backend }} --device=cpu --scenario=Offline --adr.compiler.tags=gcc --adr.inference-src.version=custom --adr.inference-src.tags=_repo.${{ github.event.pull_request.head.repo.html_url }},_branch.${{ github.event.pull_request.head.ref }} --adr.loadgen.version=custom diff --git a/.github/workflows/test-loadgen.yml b/.github/workflows/test-loadgen.yml index b010f8258..09436cc02 100755 --- a/.github/workflows/test-loadgen.yml +++ b/.github/workflows/test-loadgen.yml @@ -18,7 +18,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: ["3.8", "3.9", "3.10"] + python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"] steps: - uses: actions/checkout@v3 @@ -31,4 +31,4 @@ jobs: python3 -m pip install cm4mlops - name: Test Loadgen run: | - cm run script --tags=get,mlperf,inference,loadgen --quiet --version=custom --adr.inference-src.tags=_repo.${{ github.event.pull_request.head.repo.html_url }},_branch.${{ github.event.pull_request.head.ref }} + cm run script --tags=get,mlperf,inference,loadgen --quiet --version=custom --adr.inference-src.tags=_repo.${{ github.event.pull_request.head.repo.html_url }},_branch.${{ github.event.pull_request.head.ref }} --adr.loadgen.tags=_no-compilation-warnings diff --git a/.github/workflows/test-resnet50.yml b/.github/workflows/test-resnet50.yml index b5d09a66a..f1e3e6c39 100755 --- a/.github/workflows/test-resnet50.yml +++ b/.github/workflows/test-resnet50.yml @@ -8,6 +8,7 @@ on: branches: [ "master", "dev" ] paths: - vision/classification_and_detection/** + - loadgen/** - tools/submission/** - .github/workflows/test-resnet50.yml - '!**.md' @@ -33,4 +34,4 @@ jobs: python3 -m pip install cm4mlops - name: Test Resnet50 and end to end submission generation run: | - cm run script --tags=run,mlperf,inference,generate-run-cmds,_submission,_short --quiet --submitter="MLCommons" --hw_name=default --model=resnet50 --implementation=reference --backend=${{ matrix.backend }} --device=cpu --scenario=Offline --test_query_count=500 --adr.compiler.tags=gcc --adr.inference-src.tags=_branch.${{ github.event.pull_request.head.ref }},_repo.${{ github.event.pull_request.head.repo.html_url }} --adr.inference-src.version=custom + cm run script --tags=run,mlperf,inference,generate-run-cmds,_submission,_short --quiet --submitter="MLCommons" --hw_name=default --model=resnet50 --implementation=reference --backend=${{ matrix.backend }} --device=cpu --scenario=Offline --test_query_count=500 --adr.compiler.tags=gcc --adr.inference-src.tags=_branch.${{ github.event.pull_request.head.ref }},_repo.${{ github.event.pull_request.head.repo.html_url }} --adr.inference-src.version=custom --adr.loadgen.version=custom diff --git 
a/.github/workflows/test-retinanet.yml b/.github/workflows/test-retinanet.yml index 20b05cc25..a2f5bf8ec 100755 --- a/.github/workflows/test-retinanet.yml +++ b/.github/workflows/test-retinanet.yml @@ -33,4 +33,4 @@ jobs: python3 -m pip install cm4mlops - name: Test Retinanet and end to end submission generation run: | - cm run script --tags=run,mlperf,inference,generate-run-cmds,_submission,_short --quiet --submitter="MLCommons" --hw_name=default --model=retinanet --implementation=reference --backend=${{ matrix.backend }} --device=cpu --scenario=Offline --test_query_count=10 --adr.compiler.tags=gcc --adr.inference-src.version=custom --adr.inference-src.tags=_repo.${{ github.event.pull_request.head.repo.html_url }},_branch.${{ github.event.pull_request.head.ref }} + cm run script --tags=run,mlperf,inference,generate-run-cmds,_submission,_short --quiet --submitter="MLCommons" --hw_name=default --model=retinanet --implementation=reference --backend=${{ matrix.backend }} --device=cpu --scenario=Offline --test_query_count=10 --adr.compiler.tags=gcc --adr.inference-src.version=custom --adr.inference-src.tags=_repo.${{ github.event.pull_request.head.repo.html_url }},_branch.${{ github.event.pull_request.head.ref }} --adr.loadgen.version=custom diff --git a/language/bert/run.py b/language/bert/run.py index 2afbe56f6..f0b376564 100644 --- a/language/bert/run.py +++ b/language/bert/run.py @@ -15,6 +15,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from absl import flags +from absl import app import subprocess import mlperf_loadgen as lg import argparse @@ -22,13 +24,12 @@ import sys sys.path.insert(0, os.getcwd()) sys.path.insert(0, os.path.join(os.getcwd(), "..", "..", "lon")) -from absl import app -from absl import flags + def get_args(): parser = argparse.ArgumentParser() parser.add_argument( - "--backend", choices=["tf", "pytorch", "onnxruntime", "tf_estimator", "ray"], default="tf", help="Backend") + "--backend", choices=["tf", "pytorch", "onnxruntime", "tf_estimator", "ray"], default="tf", help="Backend") parser.add_argument("--scenario", choices=["SingleStream", "Offline", "Server", "MultiStream"], default="Offline", help="Scenario") parser.add_argument("--accuracy", action="store_true", @@ -37,30 +38,36 @@ def get_args(): help="use quantized model (only valid for onnxruntime backend)") parser.add_argument("--profile", action="store_true", help="enable profiling (only valid for onnxruntime backend)") - parser.add_argument( - "--mlperf_conf", default="build/mlperf.conf", help="mlperf rules config") parser.add_argument("--user_conf", default="user.conf", help="user config for user LoadGen settings such as target QPS") parser.add_argument("--audit_conf", default="audit.conf", help="audit config for LoadGen settings during compliance runs") parser.add_argument("--max_examples", type=int, help="Maximum number of examples to consider (not limited by default)") - parser.add_argument("--network", choices=["sut","lon",None], default=None, help="Loadgen network mode") + parser.add_argument( + "--network", + choices=[ + "sut", + "lon", + None], + default=None, + help="Loadgen network mode") parser.add_argument('--node', type=str, default="") parser.add_argument('--port', type=int, default=8000) - parser.add_argument('--sut_server', nargs="*", default= ['http://localhost:8000'], - help='Address of the server(s) under test.') + parser.add_argument('--sut_server', nargs="*", default=['http://localhost:8000'], + help='Address of the server(s) 
under test.') args = parser.parse_args() return args scenario_map = { - "SingleStream": lg.TestScenario.SingleStream, - "Offline": lg.TestScenario.Offline, - "Server": lg.TestScenario.Server, - "MultiStream": lg.TestScenario.MultiStream - } + "SingleStream": lg.TestScenario.SingleStream, + "Offline": lg.TestScenario.Offline, + "Server": lg.TestScenario.Server, + "MultiStream": lg.TestScenario.MultiStream +} + def main(): args = get_args() @@ -96,7 +103,8 @@ def main(): settings = lg.TestSettings() settings.scenario = scenario_map[args.scenario] - settings.FromConfig(args.mlperf_conf, "bert", args.scenario) + # mlperf.conf is automatically loaded by the loadgen + # settings.FromConfig(args.mlperf_conf, "bert", args.scenario) settings.FromConfig(args.user_conf, "bert", args.scenario) if args.accuracy: @@ -117,7 +125,14 @@ def main(): if args.network == "lon": from network_LON import app, set_args, main as app_main - set_args(args, settings, log_settings, args.audit_conf, args.sut_server, args.backend, args.max_examples) + set_args( + args, + settings, + log_settings, + args.audit_conf, + args.sut_server, + args.backend, + args.max_examples) app.run(app_main) elif args.network == "sut": @@ -128,7 +143,12 @@ def main(): else: print("Running LoadGen test...") - lg.StartTestWithLogSettings(sut.sut, sut.qsl.qsl, settings, log_settings, args.audit_conf) + lg.StartTestWithLogSettings( + sut.sut, + sut.qsl.qsl, + settings, + log_settings, + args.audit_conf) if args.accuracy and not os.environ.get("SKIP_VERIFY_ACCURACY"): cmd = "python3 {:}/accuracy-squad.py {}".format( os.path.dirname(os.path.abspath(__file__)), diff --git a/language/gpt-j/main.py b/language/gpt-j/main.py index bf372e812..d7c244d61 100644 --- a/language/gpt-j/main.py +++ b/language/gpt-j/main.py @@ -9,6 +9,8 @@ from GPTJ_QSL import get_GPTJ_QSL # Function to parse the arguments passed during python file execution + + def get_args(): parser = argparse.ArgumentParser() parser.add_argument( @@ -20,7 +22,10 @@ def get_args(): "--dataset-path", default="./data/cnn_eval.json", help="") parser.add_argument("--accuracy", action="store_true", help="enable accuracy pass") - parser.add_argument("--dtype", default="float32", help="data type of the model, choose from float16, bfloat16 and float32") + parser.add_argument( + "--dtype", + default="float32", + help="data type of the model, choose from float16, bfloat16 and float32") parser.add_argument("--quantized", action="store_true", help="use quantized model (only valid for onnxruntime backend)") parser.add_argument("--profile", action="store_true", @@ -29,26 +34,34 @@ def get_args(): help="use GPU instead of CPU for the inference") parser.add_argument("--audit_conf", default="audit.conf", help="audit config for LoadGen settings during compliance runs") - parser.add_argument( - "--mlperf_conf", default="mlperf.conf", help="mlperf rules config") parser.add_argument("--user_conf", default="user.conf", help="user config for user LoadGen settings such as target QPS") parser.add_argument("--max_examples", type=int, default=13368, help="Maximum number of examples to consider (not limited by default)") - parser.add_argument("--network", choices=["sut","lon",None], default=None, help="Loadgen network mode") + parser.add_argument( + "--network", + choices=[ + "sut", + "lon", + None], + default=None, + help="Loadgen network mode") parser.add_argument('--node', type=str, default="") parser.add_argument('--port', type=int, default=8000) - parser.add_argument('--sut_server', nargs="*", default= 
['http://localhost:8000'], - help='Address of the server(s) under test.') + parser.add_argument('--sut_server', nargs="*", default=['http://localhost:8000'], + help='Address of the server(s) under test.') args = parser.parse_args() return args # Function to get the amount of temporary cache generated when running the GPT-J model # Varies with the beam size set(Estimate: 6GB x Beam size) + + def get_temp_cache(): beam_size = int(os.environ.get("GPTJ_BEAM_SIZE", "4")) return 6 * beam_size + # Map the loadgen scenario as per the option given by the user scenario_map = { "SingleStream": lg.TestScenario.SingleStream, @@ -58,12 +71,15 @@ def get_temp_cache(): } # Main function triggered when the script is run + + def main(): args = get_args() qsl = None if args.network != "sut": # Gets the Query Data Loader and Query Sample Loader - # Responsible for loading(Query Sample Loader) and sending samples over the network(Query Data Loader) to the server + # Responsible for loading(Query Sample Loader) and sending samples over + # the network(Query Data Loader) to the server qsl = get_GPTJ_QSL( dataset_path=args.dataset_path, max_examples=args.max_examples @@ -72,14 +88,14 @@ def main(): qdl = GPTJ_QDL( sut_server_addr=args.sut_server, scenario=args.scenario, - qsl = qsl + qsl=qsl ) # Initiates and loads loadgen test settings and log path settings = lg.TestSettings() settings.scenario = scenario_map[args.scenario] - # Need to update the conf - settings.FromConfig(args.mlperf_conf, "gptj", args.scenario) + # mlperf conf is automatically loaded by the loadgen + # settings.FromConfig(args.mlperf_conf, "gptj", args.scenario) settings.FromConfig(args.user_conf, "gptj", args.scenario) # Chosing test mode Accutacy/Performance @@ -114,17 +130,24 @@ def main(): network=args.network, dataset_path=args.dataset_path, max_examples=args.max_examples, - qsl=qsl # If args.network is None, then only QSL get passed to the SUT, else it will be None + qsl=qsl # If args.network is None, then only QSL get passed to the SUT, else it will be None ) - + if args.network == "lon" and args.scenario == "SingleStream": print("ERROR: Single stream scenario in Loadgen Over the Network is not supported!") - # If network option is LON, QDL is loaded and request is served to SUT based on the scenario given by user(Offline or SingleStream) + # If network option is LON, QDL is loaded and request is served to SUT + # based on the scenario given by user(Offline or SingleStream) elif args.network == "lon": - lg.StartTestWithLogSettings(qdl.qdl, qsl.qsl, settings, log_settings, args.audit_conf) - - # If network option is SUT, a flask server is initiated, request is processed and output is being sent back to LON client + lg.StartTestWithLogSettings( + qdl.qdl, + qsl.qsl, + settings, + log_settings, + args.audit_conf) + + # If network option is SUT, a flask server is initiated, request is + # processed and output is being sent back to LON client elif args.network == "sut": temp_cache = get_temp_cache() from network_SUT import app, node, set_backend, set_semaphore @@ -138,12 +161,14 @@ def main(): # Acquire and release of semaphore can be found in network_SUT.py model_mem_size = sut.total_mem_size if args.gpu: - free_mem = int(os.environ.get("CM_CUDA_DEVICE_PROP_GLOBAL_MEMORY", get_gpu_memory_info())) / (1024**3) + free_mem = int(os.environ.get( + "CM_CUDA_DEVICE_PROP_GLOBAL_MEMORY", get_gpu_memory_info())) / (1024**3) else: free_mem = get_cpu_memory_info() - lockVar = math.floor((free_mem - model_mem_size)/temp_cache) + lockVar = 
math.floor((free_mem - model_mem_size) / temp_cache) node = args.node - # Semaphore is set inorder to create multiple instances upon request incomming + # Semaphore is set inorder to create multiple instances upon request + # incomming set_semaphore(lockVar) print(f"Set the semaphore lock variable to {lockVar}") # Pass SUT as the backend @@ -153,7 +178,12 @@ def main(): else: # Test not run in Loadgen Over the Network print("Running LoadGen test...") - lg.StartTestWithLogSettings(sut.sut, qsl.qsl, settings, log_settings, args.audit_conf) + lg.StartTestWithLogSettings( + sut.sut, + qsl.qsl, + settings, + log_settings, + args.audit_conf) print("Test Done!") @@ -167,4 +197,4 @@ def main(): if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/language/llama2-70b/main.py b/language/llama2-70b/main.py index c1af88b95..cc808efb3 100644 --- a/language/llama2-70b/main.py +++ b/language/llama2-70b/main.py @@ -13,37 +13,103 @@ log = logging.getLogger("Llama-70B-MAIN") # function to check the model name in server matches the user specified one + + def verify_model_name(user_specified_name, url): response = requests.get(url) if response.status_code == 200: response_dict = response.json() server_model_name = response_dict["data"][0]["id"] if user_specified_name == server_model_name: - return {"matched":True, "error":False} + return {"matched": True, "error": False} else: - return {"matched":False, "error":f"User specified {user_specified_name} and server model name {server_model_name} mismatch!"} + return {"matched": False, + "error": f"User specified {user_specified_name} and server model name {server_model_name} mismatch!"} else: - return {"matched":False, "error":f"Failed to get a valid response. Status code: {response.status_code}"} + return {"matched": False, + "error": f"Failed to get a valid response. Status code: {response.status_code}"} + def get_args(): parser = argparse.ArgumentParser() - parser.add_argument("--scenario", type=str, choices=["Offline", "Server"], default="Offline", help="Scenario") - parser.add_argument("--model-path", type=str, default="meta-llama/Llama-2-70b-chat-hf", help="Model name") + parser.add_argument( + "--scenario", + type=str, + choices=[ + "Offline", + "Server"], + default="Offline", + help="Scenario") + parser.add_argument( + "--model-path", + type=str, + default="meta-llama/Llama-2-70b-chat-hf", + help="Model name") parser.add_argument("--dataset-path", type=str, default=None, help="") - parser.add_argument("--accuracy", action="store_true", help="Run accuracy mode") - parser.add_argument("--dtype", type=str, default="float32", help="data type of the model, choose from float16, bfloat16 and float32") - parser.add_argument("--device", type=str, choices=["cpu", "cuda:0"], default="cpu", help="device to use") - parser.add_argument("--audit-conf", type=str, default="audit.conf", help="audit config for LoadGen settings during compliance runs") - parser.add_argument("--mlperf-conf", type=str, default="mlperf.conf", help="mlperf rules config") - parser.add_argument("--user-conf", type=str, default="user.conf", help="user config for user LoadGen settings such as target QPS") - parser.add_argument("--total-sample-count", type=int, default=24576, help="Number of samples to use in benchmark.") # TODO: This interpretation of 'total-sample-count' is a little misleading. 
Fix it - parser.add_argument("--batch-size", type=int, default=1, help="Model batch-size to use in benchmark.") - parser.add_argument("--output-log-dir", type=str, default="output-logs", help="Where logs are saved") - parser.add_argument("--enable-log-trace", action="store_true", help="Enable log tracing. This file can become quite large") - parser.add_argument("--num-workers", type=int, default=1, help="Number of workers to process queries") + parser.add_argument( + "--accuracy", + action="store_true", + help="Run accuracy mode") + parser.add_argument( + "--dtype", + type=str, + default="float32", + help="data type of the model, choose from float16, bfloat16 and float32") + parser.add_argument( + "--device", + type=str, + choices=[ + "cpu", + "cuda:0"], + default="cpu", + help="device to use") + parser.add_argument( + "--audit-conf", + type=str, + default="audit.conf", + help="audit config for LoadGen settings during compliance runs") + parser.add_argument( + "--user-conf", + type=str, + default="user.conf", + help="user config for user LoadGen settings such as target QPS") + # TODO: This interpretation of 'total-sample-count' is a little + # misleading. Fix it + parser.add_argument( + "--total-sample-count", + type=int, + default=24576, + help="Number of samples to use in benchmark.") + parser.add_argument( + "--batch-size", + type=int, + default=1, + help="Model batch-size to use in benchmark.") + parser.add_argument( + "--output-log-dir", + type=str, + default="output-logs", + help="Where logs are saved") + parser.add_argument( + "--enable-log-trace", + action="store_true", + help="Enable log tracing. This file can become quite large") + parser.add_argument( + "--num-workers", + type=int, + default=1, + help="Number of workers to process queries") parser.add_argument("--vllm", action="store_true", help="vllm mode") - parser.add_argument("--api-model-name", type=str, default="meta-llama/Llama-2-70b-chat-hf", help="Model name(specified in llm server)") - parser.add_argument("--api-server", type=str, default=None, help="Specify an api endpoint call to use api mode") + parser.add_argument( + "--api-model-name", + type=str, + default="meta-llama/Llama-2-70b-chat-hf", + help="Model name(specified in llm server)") + parser.add_argument( + "--api-server", + type=str, + default=None, + help="Specify an api endpoint call to use api mode") args = parser.parse_args() return args @@ -52,13 +118,16 @@ def get_args(): scenario_map = { "offline": lg.TestScenario.Offline, "server": lg.TestScenario.Server, - } +} + def main(): args = get_args() - + if args.vllm: - resp = verify_model_name(args.api_model_name, args.api_server+"/v1/models") + resp = verify_model_name( + args.api_model_name, + args.api_server + "/v1/models") if resp["error"]: print(f"\n\n\033[91mError:\033[0m", end=" ") print(resp["error"]) @@ -66,8 +135,8 @@ def main(): settings = lg.TestSettings() settings.scenario = scenario_map[args.scenario.lower()] - # Need to update the conf - settings.FromConfig(args.mlperf_conf, "llama2-70b", args.scenario) + # mlperf.conf is automatically loaded by the loadgen + # settings.FromConfig(args.mlperf_conf, "llama2-70b", args.scenario) settings.FromConfig(args.user_conf, "llama2-70b", args.scenario) if args.accuracy: @@ -91,7 +160,7 @@ def main(): sut_map = { "offline": SUT, "server": SUTServer - } + } sut_cls = sut_map[args.scenario.lower()] @@ -122,7 +191,12 @@ def main(): sut.start() lgSUT = lg.ConstructSUT(sut.issue_queries, sut.flush_queries) log.info("Starting Benchmark run") - 
lg.StartTestWithLogSettings(lgSUT, sut.qsl, settings, log_settings, args.audit_conf) + lg.StartTestWithLogSettings( + lgSUT, + sut.qsl, + settings, + log_settings, + args.audit_conf) # Stop sut after completion sut.stop() diff --git a/language/mixtral-8x7b/main.py b/language/mixtral-8x7b/main.py index c28cdc536..3f2cd61b7 100644 --- a/language/mixtral-8x7b/main.py +++ b/language/mixtral-8x7b/main.py @@ -54,11 +54,6 @@ def get_args(): type=str, default="audit.conf", help="audit config for LoadGen settings during compliance runs") - parser.add_argument( - "--mlperf-conf", - type=str, - default="mlperf.conf", - help="mlperf rules config") parser.add_argument( "--user-conf", type=str, @@ -111,8 +106,8 @@ def main(): settings = lg.TestSettings() settings.scenario = scenario_map[args.scenario.lower()] - # Need to update the conf - settings.FromConfig(args.mlperf_conf, "mixtral-8x7b", args.scenario) + # mlperf_conf is automatically loaded by the loadgen + # settings.FromConfig(args.mlperf_conf, "mixtral-8x7b", args.scenario) settings.FromConfig(args.user_conf, "mixtral-8x7b", args.scenario) if args.accuracy: diff --git a/loadgen/CMakeLists.txt b/loadgen/CMakeLists.txt index 35420e806..66a68f6ab 100644 --- a/loadgen/CMakeLists.txt +++ b/loadgen/CMakeLists.txt @@ -1,10 +1,17 @@ -cmake_minimum_required(VERSION 3.1) +cmake_minimum_required(VERSION 3.5) project(mlperf_loadgen) -# The mlperf_loadgen version. -set(mlperf_loadgen_VERSION_MAJOR 4) -set(mlperf_loadgen_VERSION_MINOR 1) +# Read the version file +file(READ "${CMAKE_SOURCE_DIR}/VERSION.txt" VERSION_CONTENTS) + +# Extract the major and minor versions from the VERSION file (assuming "MAJOR.MINOR.PATCH" format) +string(REGEX MATCH "^([0-9]+)\\.([0-9]+)" VERSION_MATCH ${VERSION_CONTENTS}) + +# Set the variables for the major and minor versions +set(mlperf_loadgen_VERSION_MAJOR "${CMAKE_MATCH_1}") +set(mlperf_loadgen_VERSION_MINOR "${CMAKE_MATCH_2}") + message("mlperf_loadgen v${mlperf_loadgen_VERSION_MAJOR}.${mlperf_loadgen_VERSION_MINOR}") # Set build options. NB: CXX_STANDARD is supported since CMake 3.1. 
@@ -73,3 +80,8 @@ install(TARGETS mlperf_loadgen DESTINATION ${CMAKE_INSTALL_PREFIX}/lib) install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/ DESTINATION ${CMAKE_INSTALL_PREFIX}/include FILES_MATCHING PATTERN "*.h") +install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/mlperf.conf + DESTINATION ${CMAKE_INSTALL_PREFIX}/include) + +# Define preprocessor macro with the path to mlperf.conf +add_definitions(-DMLPERF_CONF_PATH=\"${CMAKE_INSTALL_PREFIX}/include/mlperf.conf\") diff --git a/loadgen/MANIFEST.in b/loadgen/MANIFEST.in new file mode 100644 index 000000000..1cdd95c6d --- /dev/null +++ b/loadgen/MANIFEST.in @@ -0,0 +1,2 @@ +include mlperf.conf +include VERSION.txt diff --git a/loadgen/VERSION.txt b/loadgen/VERSION.txt new file mode 100644 index 000000000..9edf2a44f --- /dev/null +++ b/loadgen/VERSION.txt @@ -0,0 +1 @@ +4.1.7 diff --git a/loadgen/__init__.py b/loadgen/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/loadgen/benchmark/repro.cpp b/loadgen/benchmark/repro.cpp index 1112ef961..44ff53efa 100644 --- a/loadgen/benchmark/repro.cpp +++ b/loadgen/benchmark/repro.cpp @@ -33,10 +33,10 @@ class QSL : public mlperf::QuerySampleLibrary { const std::string& Name() override { return mName; } size_t TotalSampleCount() override { return 1000000; } size_t PerformanceSampleCount() override { return TotalSampleCount(); } - void LoadSamplesToRam( - const std::vector& samples) override {} + void LoadSamplesToRam(const std::vector&) override { + } void UnloadSamplesFromRam( - const std::vector& samples) override {} + const std::vector&) override {} private: std::string mName{"Dummy QSL"}; @@ -51,14 +51,14 @@ class BasicSUT : public mlperf::SystemUnderTest { ~BasicSUT() override {} const std::string& Name() override { return mName; } void IssueQuery(const std::vector& samples) override { - int n = samples.size(); + size_t n = samples.size(); if (n > mResponses.size()) { std::cerr << "Warning: reallocating response buffer in BasicSUT. Maybe " "you should initResponse with larger value!?" << std::endl; initResponse(samples.size()); } - for (int i = 0; i < n; i++) { + for (size_t i = 0; i < n; i++) { mResponses[i].id = samples[i].id; } mlperf::QuerySamplesComplete(mResponses.data(), n); @@ -122,7 +122,7 @@ class QueueSUT : public mlperf::SystemUnderTest { } actualSize = std::min(maxSize, mIdQueue.size()); - for (int i = 0; i < actualSize; i++) { + for (size_t i = 0; i < actualSize; i++) { responses[i].id = mIdQueue.front(); mIdQueue.pop_front(); } @@ -166,7 +166,7 @@ class MultiBasicSUT : public mlperf::SystemUnderTest { const std::string& Name() override { return mName; } void IssueQuery(const std::vector& samples) override { int thread_idx = mThreadMap[std::this_thread::get_id()]; - int n = samples.size(); + size_t n = samples.size(); auto& reponses = mResponses[thread_idx]; if (n > reponses.size()) { std::cout @@ -175,7 +175,7 @@ class MultiBasicSUT : public mlperf::SystemUnderTest { << std::endl; initResponse(samples.size()); } - for (int i = 0; i < n; i++) { + for (size_t i = 0; i < n; i++) { reponses[i].id = samples[i].id; } mlperf::QuerySamplesComplete(reponses.data(), n); diff --git a/loadgen/bindings/python_api.cc b/loadgen/bindings/python_api.cc index f3155987a..7f50f5f56 100644 --- a/loadgen/bindings/python_api.cc +++ b/loadgen/bindings/python_api.cc @@ -19,9 +19,9 @@ limitations under the License. 
#include #include "../loadgen.h" +#include "../query_dispatch_library.h" #include "../query_sample.h" #include "../query_sample_library.h" -#include "../query_dispatch_library.h" #include "../system_under_test.h" #include "../test_settings.h" #include "pybind11/functional.h" @@ -133,34 +133,36 @@ class QuerySampleLibraryTrampoline : public QuerySampleLibrary { // A QDL that allows defining callbacks for // IssueQuery, FlushQueries, and Name methods. class QueryDispatchLibraryTrampoline : public QueryDispatchLibrary { - public: - QueryDispatchLibraryTrampoline(IssueQueryCallback issue_query_callback, + public: + QueryDispatchLibraryTrampoline(IssueQueryCallback issue_query_callback, FlushQueriesCallback flush_queries_callback, NameCallback name_callback) - : issue_query_callback_(issue_query_callback), - flush_queries_callback_(flush_queries_callback), - name_callback_(name_callback) {} - - // Returns the name of the SUT. Name shall be returned over the network - // TODO: other bindings should also be fixed eventually to be used over the network - const std::string& Name() override { - static std::string name; // HACK: avoid returning a reference to temporary. - pybind11::gil_scoped_acquire gil_acquirer; - name = name_callback_(); // name_callback_() shall returned name over the network. - return name; - } + : issue_query_callback_(issue_query_callback), + flush_queries_callback_(flush_queries_callback), + name_callback_(name_callback) {} + + // Returns the name of the SUT. Name shall be returned over the network + // TODO: other bindings should also be fixed eventually to be used over the + // network + const std::string& Name() override { + static std::string name; // HACK: avoid returning a reference to temporary. + pybind11::gil_scoped_acquire gil_acquirer; + name = name_callback_(); // name_callback_() shall returned name over the + // network. 
+ return name; + } - void IssueQuery(const std::vector& samples) override { - pybind11::gil_scoped_acquire gil_acquirer; - issue_query_callback_(samples); - } + void IssueQuery(const std::vector& samples) override { + pybind11::gil_scoped_acquire gil_acquirer; + issue_query_callback_(samples); + } - void FlushQueries() override { flush_queries_callback_(); } + void FlushQueries() override { flush_queries_callback_(); } - protected: - IssueQueryCallback issue_query_callback_; - FlushQueriesCallback flush_queries_callback_; - NameCallback name_callback_; + protected: + IssueQueryCallback issue_query_callback_; + FlushQueriesCallback flush_queries_callback_; + NameCallback name_callback_; }; } // namespace @@ -213,8 +215,8 @@ void DestroyQSL(uintptr_t qsl) { uintptr_t ConstructQDL(IssueQueryCallback issue_cb, FlushQueriesCallback flush_queries_cb, NameCallback name_callback) { - QueryDispatchLibraryTrampoline* qdl = - new QueryDispatchLibraryTrampoline(issue_cb, flush_queries_cb, name_callback); + QueryDispatchLibraryTrampoline* qdl = new QueryDispatchLibraryTrampoline( + issue_cb, flush_queries_cb, name_callback); return reinterpret_cast(qdl); } @@ -223,7 +225,7 @@ void DestroyQDL(uintptr_t qdl) { reinterpret_cast(qdl); delete qdl_cast; } - + void StartTest(uintptr_t sut, uintptr_t qsl, mlperf::TestSettings test_settings, const std::string& audit_config_filename) { pybind11::gil_scoped_release gil_releaser; @@ -259,7 +261,7 @@ void QuerySamplesComplete(std::vector responses, } void FirstTokenComplete(std::vector responses, - ResponseCallback response_cb = {}) { + ResponseCallback response_cb = {}) { pybind11::gil_scoped_release gil_releaser; mlperf::FirstTokenComplete(responses.data(), responses.size(), response_cb); } @@ -331,18 +333,26 @@ PYBIND11_MODULE(mlperf_loadgen, m) { &TestSettings::performance_issue_same_index) .def_readwrite("performance_sample_count_override", &TestSettings::performance_sample_count_override) - .def_readwrite("test05", - &TestSettings::test05) + .def_readwrite("test05", &TestSettings::test05) .def_readwrite("test05_qsl_rng_seed", &TestSettings::test05_qsl_rng_seed) .def_readwrite("test05_sample_index_rng_seed", &TestSettings::test05_sample_index_rng_seed) - .def_readwrite("test05_schedule_rng_seed", &TestSettings::test05_schedule_rng_seed) + .def_readwrite("test05_schedule_rng_seed", + &TestSettings::test05_schedule_rng_seed) .def_readwrite("use_token_latencies", &TestSettings::use_token_latencies) .def_readwrite("ttft_latency", &TestSettings::server_ttft_latency) .def_readwrite("tpot_latency", &TestSettings::server_tpot_latency) - .def_readwrite("infer_token_latencies", &TestSettings::infer_token_latencies) - .def_readwrite("token_latency_scaling_factor", &TestSettings::token_latency_scaling_factor) - .def("FromConfig", &TestSettings::FromConfig, "FromConfig."); + .def_readwrite("infer_token_latencies", + &TestSettings::infer_token_latencies) + .def_readwrite("token_latency_scaling_factor", + &TestSettings::token_latency_scaling_factor) + .def("FromConfig", &TestSettings::FromConfig, pybind11::arg("path"), + pybind11::arg("model"), pybind11::arg("scenario"), + pybind11::arg("is_mlperf_conf") = false, + "This function configures settings from the given user " + "configuration file, model, and scenario. 
The is_mlperf_conf flag " + "should be set to false or else only the default mlperf_conf file " + "will be loaded by the loadgen."); pybind11::enum_(m, "LoggingMode") .value("AsyncPoll", LoggingMode::AsyncPoll) @@ -410,10 +420,9 @@ PYBIND11_MODULE(mlperf_loadgen, m) { q.id = t[0].cast(); q.data = t[1].cast(); q.size = t[2].cast(); - if (t.size() == 4){ + if (t.size() == 4) { q.n_tokens = t[3].cast(); - } - else{ + } else { q.n_tokens = 0; } return q; @@ -438,10 +447,11 @@ PYBIND11_MODULE(mlperf_loadgen, m) { m.def("DestroyQSL", &py::DestroyQSL, "Destroy the object created by ConstructQSL."); - m.def("ConstructQDL", &py::ConstructQDL, - "Construct the query sample library, communicating with the SUT over the network."); + m.def("ConstructQDL", &py::ConstructQDL, + "Construct the query sample library, communicating with the SUT over " + "the network."); m.def("DestroyQDL", &py::DestroyQDL, - "Destroy the object created by ConstructQDL."); + "Destroy the object created by ConstructQDL."); m.def("StartTest", &py::StartTest, "Run tests on a SUT created by ConstructSUT() with the provided QSL. " diff --git a/loadgen/mlperf.conf b/loadgen/mlperf.conf new file mode 100644 index 000000000..10f7ae78f --- /dev/null +++ b/loadgen/mlperf.conf @@ -0,0 +1,98 @@ +# The format of this config file is 'key = value'. +# The key has the format 'model.scenario.key'. Value is mostly int64_t. +# Model maybe '*' as wildcard. In that case the value applies to all models. +# All times are in milli seconds + +# Set performance_sample_count for each model. +# User can optionally set this to higher values in user.conf. +resnet50.*.performance_sample_count_override = 1024 +ssd-mobilenet.*.performance_sample_count_override = 256 +retinanet.*.performance_sample_count_override = 64 +bert.*.performance_sample_count_override = 10833 +dlrm.*.performance_sample_count_override = 204800 +dlrm-v2.*.performance_sample_count_override = 204800 +rnnt.*.performance_sample_count_override = 2513 +gptj.*.performance_sample_count_override = 13368 +llama2-70b.*.performance_sample_count_override = 24576 +stable-diffusion-xl.*.performance_sample_count_override = 5000 +# set to 0 to let entire sample set to be performance sample +3d-unet.*.performance_sample_count_override = 0 + +# Set seeds. The seeds will be distributed two weeks before the submission. +*.*.qsl_rng_seed = 3066443479025735752 +*.*.sample_index_rng_seed = 10688027786191513374 +*.*.schedule_rng_seed = 14962580496156340209 +# Set seeds for TEST_05. The seeds will be distributed two weeks before the submission. 
+*.*.test05_qsl_rng_seed = 16799458546791641818 +*.*.test05_sample_index_rng_seed = 5453809927556429288 +*.*.test05_schedule_rng_seed = 5435552105434836064 + + +*.SingleStream.target_latency_percentile = 90 +*.SingleStream.min_duration = 600000 + +*.MultiStream.target_latency_percentile = 99 +*.MultiStream.samples_per_query = 8 +*.MultiStream.min_duration = 600000 +*.MultiStream.min_query_count = 662 +retinanet.MultiStream.target_latency = 528 + +# 3D-UNet uses equal issue mode because it has non-uniform inputs +3d-unet.*.sample_concatenate_permutation = 1 + +# LLM benchmarks have non-uniform inputs and outputs, and use equal issue mode for all latency scenario +gptj.*.sample_concatenate_permutation = 1 +llama2-70b.*.sample_concatenate_permutation = 1 +mixtral-8x7b.*.sample_concatenate_permutation = 1 + +*.Server.target_latency = 10 +*.Server.target_latency_percentile = 99 +*.Server.target_duration = 0 +*.Server.min_duration = 600000 +resnet50.Server.target_latency = 15 +retinanet.Server.target_latency = 100 +bert.Server.target_latency = 130 +dlrm.Server.target_latency = 60 +dlrm-v2.Server.target_latency = 60 +rnnt.Server.target_latency = 1000 +gptj.Server.target_latency = 20000 +stable-diffusion-xl.Server.target_latency = 20000 +# Llama2-70b benchmarks measures token latencies +llama2-70b.*.use_token_latencies = 1 +mixtral-8x7b.*.use_token_latencies = 1 +# gptj benchmark infers token latencies +gptj.*.infer_token_latencies = 1 +gptj.*.token_latency_scaling_factor = 69 +# Only ttft and tpot are tracked for the llama2-70b & mixtral-8x7B benchmark therefore target_latency = 0 +llama2-70b.Server.target_latency = 0 +llama2-70b.Server.ttft_latency = 2000 +llama2-70b.Server.tpot_latency = 200 + +mixtral-8x7b.Server.target_latency = 0 +mixtral-8x7b.Server.ttft_latency = 2000 +mixtral-8x7b.Server.tpot_latency = 200 + +*.Offline.target_latency_percentile = 90 +*.Offline.min_duration = 600000 + +# In Offline scenario, we always have one query. But LoadGen maps this to +# min_sample_count internally in Offline scenario. If the dataset size is larger +# than 24576 we limit the min_query_count to 24576 and otherwise we use +# the dataset size as the limit + +resnet50.Offline.min_query_count = 24576 +retinanet.Offline.min_query_count = 24576 +dlrm-v2.Offline.min_query_count = 24576 +bert.Offline.min_query_count = 10833 +gptj.Offline.min_query_count = 13368 +rnnt.Offline.min_query_count = 2513 +3d-unet.Offline.min_query_count = 43 +stable-diffusion-xl.Offline.min_query_count = 5000 +llama2-70b.Offline.min_query_count = 24576 +mixtral-8x7b.Offline.min_query_count = 15000 + +# These fields should be defined and overridden by user.conf. +*.SingleStream.target_latency = 10 +*.MultiStream.target_latency = 80 +*.Server.target_qps = 1.0 +*.Offline.target_qps = 1.0 diff --git a/loadgen/pyproject.toml b/loadgen/pyproject.toml index 9be24eea6..6f0ae06f0 100755 --- a/loadgen/pyproject.toml +++ b/loadgen/pyproject.toml @@ -4,4 +4,4 @@ build-backend = "setuptools.build_meta:__legacy__" [tool.cibuildwheel] environment = "CFLAGS='-std=c++14'" -build = "cp3{7,8,9,10,11,12}-*" +build = "cp3{7,8,9,10,11,12,13}-*" diff --git a/loadgen/results.h b/loadgen/results.h index 38bbe32d4..6befea2c0 100644 --- a/loadgen/results.h +++ b/loadgen/results.h @@ -16,12 +16,12 @@ limitations under the License. 
#ifndef MLPERF_LOADGEN_RESULTS_H_ #define MLPERF_LOADGEN_RESULTS_H_ +#include +#include + #include "query_sample.h" #include "test_settings_internal.h" -#include -#include - namespace mlperf { namespace loadgen { @@ -46,7 +46,6 @@ struct PerformanceResult { TokenPerformanceResults token_results; }; - /// \brief Wraps PerformanceResult with relevant context to change how /// it's interpreted and reported. struct PerformanceSummary { @@ -84,20 +83,21 @@ struct PerformanceSummary { // Set by ProcessTokenLatencies size_t token_count = 0; size_t overlatency_first_token_count = 0; - QuerySampleLatency first_token_latency_min; - QuerySampleLatency first_token_latency_max; - QuerySampleLatency first_token_latency_mean; - QuerySampleLatency time_per_output_token_min; - QuerySampleLatency time_per_output_token_max; - QuerySampleLatency time_per_output_token_mean; + QuerySampleLatency first_token_latency_min = 0; + QuerySampleLatency first_token_latency_max = 0; + QuerySampleLatency first_token_latency_mean = 0; + QuerySampleLatency time_per_output_token_min = 0; + QuerySampleLatency time_per_output_token_max = 0; + QuerySampleLatency time_per_output_token_mean = 0; // Latency token target percentile - PercentileEntry token_target_latency_percentile{settings.target_latency_percentile}; + PercentileEntry token_target_latency_percentile{ + settings.target_latency_percentile}; PercentileEntry token_latency_percentiles[6] = {{.50}, {.90}, {.95}, {.97}, {.99}, {.999}}; PercentileEntry target_tpot_percentile{settings.target_latency_percentile}; PercentileEntry tpot_percentiles[6] = {{.50}, {.90}, {.95}, - {.97}, {.99}, {.999}}; + {.97}, {.99}, {.999}}; #if defined(_WIN32) || defined(WIN32) || defined(_WIN64) || defined(WIN64) // MSVC complains if there is no explicit constructor. @@ -111,10 +111,10 @@ struct PerformanceSummary { void ProcessTokenLatencies(); bool MinDurationMet(std::string* recommendation); - bool EarlyStopping(std::string* recommendation, int64_t queries_issued, - std::vector* sample_latencies, - std::vector* query_latencies, - std::chrono::nanoseconds target_latency); + bool EarlyStopping(std::string* recommendation, int64_t queries_issued, + std::vector* sample_latencies, + std::vector* query_latencies, + std::chrono::nanoseconds target_latency); bool MinQueriesMet(); bool MinSamplesMet(); bool HasPerfConstraints(); @@ -125,5 +125,4 @@ struct PerformanceSummary { } // namespace loadgen } // namespace mlperf - #endif diff --git a/loadgen/setup.py b/loadgen/setup.py index 8dfc5b9f0..1e8cb0a8d 100644 --- a/loadgen/setup.py +++ b/loadgen/setup.py @@ -13,7 +13,7 @@ # limitations under the License. # ============================================================================= -## \file +# \file # \brief MLPerf Inference LoadGen python module setup. # \details Creates a module that python can import. # All source files are compiled by python"s C++ toolchain without depending @@ -24,12 +24,11 @@ # and binaries. Use one of the gn build targets instead if you want # to avoid poluting the source tree. 
-from setuptools import Extension -from setuptools import setup -from version_generator import generate_loadgen_version_definitions +from setuptools import Extension, setup from pathlib import Path from pybind11 import get_include from pybind11.setup_helpers import Pybind11Extension, build_ext +from version_generator import generate_loadgen_version_definitions generated_version_source_filename = "generated/version_generated.cc" generate_loadgen_version_definitions(generated_version_source_filename, ".") @@ -77,21 +76,42 @@ mlperf_loadgen_sources_no_gen = lib_sources + lib_bindings mlperf_loadgen_sources = (mlperf_loadgen_sources_no_gen + [generated_version_source_filename]) -mlperf_long_description = (this_directory / "README.md").read_text(encoding="utf-8") +mlperf_long_description = ( + this_directory / + "README.md").read_text( + encoding="utf-8") + +config_file_path = Path(__file__).parent / "mlperf.conf" + +with open("VERSION.txt", "r") as f: + version = f.read() +version_split = version.split(".") + +if len(version_split) < 2: + print("Version is incomplete. Needs a format like 4.1 in VERSION file") mlperf_loadgen_module = Pybind11Extension( - "mlperf_loadgen", - define_macros=[("MAJOR_VERSION", "4"), ("MINOR_VERSION", "1")], - include_dirs=[".", get_include()], - sources=mlperf_loadgen_sources, - depends=mlperf_loadgen_headers) + "mlperf_loadgen", + define_macros=[ + ("MAJOR_VERSION", + version_split[0]), + ("MINOR_VERSION", + version_split[1]), + ("MLPERF_CONF_PATH", + f'"{config_file_path}"')], + include_dirs=[".", get_include()], + sources=mlperf_loadgen_sources, + depends=mlperf_loadgen_headers) setup(name="mlcommons_loadgen", - version="4.1", + version=version, description="MLPerf Inference LoadGen python bindings", url="https://mlcommons.org/", cmdclass={"build_ext": build_ext}, ext_modules=[mlperf_loadgen_module], + packages=[''], + package_data={'': ['mlperf.conf']}, + include_package_data=True, long_description=mlperf_long_description, long_description_content_type='text/markdown') diff --git a/loadgen/test_settings.h b/loadgen/test_settings.h index 8b209035c..739b2947f 100644 --- a/loadgen/test_settings.h +++ b/loadgen/test_settings.h @@ -180,10 +180,10 @@ struct TestSettings { double offline_expected_qps = 1; /// \brief Affects the order in which the samples of the dataset are chosen. /// If false it concatenates a single permutation of the dataset (or part - /// of it depending on QSL->PerformanceSampleCount()) several times up to the + /// of it depending on QSL->PerformanceSampleCount()) several times up to the /// number of samples requested. - /// If true it concatenates a multiple permutation of the dataset (or a - /// part of it depending on QSL->PerformanceSampleCount()) several times + /// If true it concatenates a multiple permutation of the dataset (or a + /// part of it depending on QSL->PerformanceSampleCount()) several times /// up to the number of samples requested. bool sample_concatenate_permutation = false; /**@}*/ @@ -228,7 +228,8 @@ struct TestSettings { uint64_t accuracy_log_sampling_target = 0; /// \brief Variables for running test05 from native config. A boolean that - /// determines whether or not to run test05 and three random seed to run the test + /// determines whether or not to run test05 and three random seed to run the + /// test bool test05 = false; uint64_t test05_qsl_rng_seed = 0; uint64_t test05_sample_index_rng_seed = 0; @@ -236,7 +237,7 @@ struct TestSettings { /// \brief Load mlperf parameter config from file. 
int FromConfig(const std::string &path, const std::string &model, - const std::string &scenario); + const std::string &scenario, bool is_mlperf_conf = false); /**@}*/ // ================================== diff --git a/loadgen/test_settings_internal.cc b/loadgen/test_settings_internal.cc index ce1b9c4be..3dd49c25d 100644 --- a/loadgen/test_settings_internal.cc +++ b/loadgen/test_settings_internal.cc @@ -53,7 +53,7 @@ TestSettingsInternal::TestSettingsInternal( server_ttft_latency(requested.server_ttft_latency), server_tpot_latency(requested.server_tpot_latency), infer_token_latencies(requested.infer_token_latencies), - token_latency_scaling_factor(requested.token_latency_scaling_factor){ + token_latency_scaling_factor(requested.token_latency_scaling_factor) { // Target QPS, target latency, and max_async_queries. switch (requested.scenario) { case TestScenario::SingleStream: @@ -158,14 +158,15 @@ TestSettingsInternal::TestSettingsInternal( // performance_sample_count == 0 makes it to be equal to loaded_samples.size() if (sample_concatenate_permutation && requested.scenario == TestScenario::SingleStream) { - // set slack larger for 3D-UNet KiTS19 distribution, i.e. 50% latency << 90% latency + // set slack larger for 3D-UNet KiTS19 distribution, i.e. 50% latency << 90% + // latency constexpr double kSlack = 2.0; - uint64_t expected_queries = kSlack * DurationToSeconds(target_duration) * target_qps; - min_query_count = min_query_count > expected_queries - ? min_query_count - : expected_queries; - min_query_count += - qsl_performance_sample_count - (min_query_count % qsl_performance_sample_count); + uint64_t expected_queries = + kSlack * DurationToSeconds(target_duration) * target_qps; + min_query_count = + min_query_count > expected_queries ? min_query_count : expected_queries; + min_query_count += qsl_performance_sample_count - + (min_query_count % qsl_performance_sample_count); } min_sample_count = min_query_count * samples_per_query; @@ -335,13 +336,16 @@ void LogRequestedTestSettings(const TestSettings &s) { MLPERF_LOG(detail, "requested_performance_sample_count_override", s.performance_sample_count_override); MLPERF_LOG(detail, "requested_sample_concatenate_permutation", - s.sample_concatenate_permutation); + s.sample_concatenate_permutation); // Token latencies specific values - if (s.use_token_latencies){ - MLPERF_LOG(detail, "requested_use_token_latencies", s.use_token_latencies); - if (s.scenario != TestScenario::Offline){ - MLPERF_LOG(detail, "requested_server_ttft_latency", s.server_ttft_latency); - MLPERF_LOG(detail, "requested_server_tpot_latency", s.server_tpot_latency); + if (s.use_token_latencies) { + MLPERF_LOG(detail, "requested_use_token_latencies", + s.use_token_latencies); + if (s.scenario != TestScenario::Offline) { + MLPERF_LOG(detail, "requested_server_ttft_latency", + s.server_ttft_latency); + MLPERF_LOG(detail, "requested_server_tpot_latency", + s.server_tpot_latency); } } #else @@ -488,7 +492,7 @@ void TestSettingsInternal::LogAllSettings() const { void TestSettingsInternal::LogSummary(AsyncSummary &summary) const { summary("samples_per_query : ", samples_per_query); summary("target_qps : ", target_qps); - if (!use_token_latencies){ + if (!use_token_latencies) { summary("target_latency (ns): ", target_latency.count()); } else { summary("ttft_latency (ns): ", server_ttft_latency); @@ -514,13 +518,28 @@ void TestSettingsInternal::LogSummary(AsyncSummary &summary) const { } // namespace loadgen -/// \todo The TestSettings::FromConfig definition belongs in a 
test_settings.cc -/// file which doesn't yet exist. To avoid churn so close to the submission -/// deadline, adding a test_settings.cc file has been deferred to v0.6. int TestSettings::FromConfig(const std::string &path, const std::string &model, - const std::string &scenario) { - // TODO: move this method to a new file test_settings.cc + const std::string &scenario, bool is_mlperf_conf) { std::map kv; + static int configCount = 0; + + if (!is_mlperf_conf) { + if (configCount == 0) { + // Only allow userConf as the single configFile and loadgen loads the + // mlperfConf automatically + FromConfig(MLPERF_CONF_PATH, model, scenario, true); + } + + else { + LogDetail([](AsyncDetail &detail) { + std::stringstream ss; + ss << "Multiple conf files are used. This is not valid for official " + "submission."; + MLPERF_LOG_ERROR(detail, "error_invalid_config", ss.str()); + }); + } + configCount++; + } // lookup key/value pairs from config auto lookupkv = [&](const std::string &model, const std::string &scenario, @@ -662,66 +681,57 @@ int TestSettings::FromConfig(const std::string &path, const std::string &model, break; } } - lookupkv(model, scenario, "min_duration", &min_duration_ms, nullptr); - lookupkv(model, scenario, "max_duration", &max_duration_ms, nullptr); - lookupkv(model, scenario, "min_query_count", &min_query_count, nullptr); - lookupkv(model, scenario, "max_query_count", &max_query_count, nullptr); - lookupkv(model, scenario, "qsl_rng_seed", &qsl_rng_seed, nullptr); - lookupkv(model, scenario, "sample_index_rng_seed", &sample_index_rng_seed, - nullptr); - lookupkv(model, scenario, "schedule_rng_seed", &schedule_rng_seed, nullptr); - lookupkv(model, scenario, "accuracy_log_rng_seed", &accuracy_log_rng_seed, - nullptr); - lookupkv(model, scenario, "accuracy_log_probability", nullptr, - &accuracy_log_probability, 0.01); - lookupkv(model, scenario, "accuracy_log_sampling_target", - &accuracy_log_sampling_target, nullptr); - if (lookupkv(model, scenario, "print_timestamps", &val, nullptr)) - print_timestamps = (val == 0) ? false : true; - if (lookupkv(model, scenario, "performance_issue_unique", &val, nullptr)) - performance_issue_unique = (val == 0) ? false : true; - if (lookupkv(model, scenario, "performance_issue_same", &val, nullptr)) - performance_issue_same = (val == 0) ? false : true; - lookupkv(model, scenario, "performance_issue_same_index", - &performance_issue_same_index, nullptr); - lookupkv(model, scenario, "performance_sample_count_override", - &performance_sample_count_override, nullptr); - if (lookupkv(model, scenario, "sample_concatenate_permutation", &val, nullptr)) - sample_concatenate_permutation = (val == 1) ? true : false; - if (lookupkv(model, scenario, "test05", &val, nullptr)) - test05 = (val == 1) ? 
true : false; - lookupkv(model, scenario, "test05_qsl_rng_seed", &test05_qsl_rng_seed, nullptr); - lookupkv(model, scenario, "test05_sample_index_rng_seed", &test05_sample_index_rng_seed, - nullptr); - lookupkv(model, scenario, "test05_schedule_rng_seed", &test05_schedule_rng_seed, nullptr); - - // keys to measure token metrics - if (lookupkv(model, scenario, "use_token_latencies", &val, nullptr)){ + if (is_mlperf_conf) { + lookupkv(model, scenario, "qsl_rng_seed", &qsl_rng_seed, nullptr); + lookupkv(model, scenario, "sample_index_rng_seed", &sample_index_rng_seed, + nullptr); + lookupkv(model, scenario, "schedule_rng_seed", &schedule_rng_seed, nullptr); + lookupkv(model, scenario, "accuracy_log_rng_seed", &accuracy_log_rng_seed, + nullptr); + lookupkv(model, scenario, "accuracy_log_probability", nullptr, + &accuracy_log_probability, 0.01); + lookupkv(model, scenario, "accuracy_log_sampling_target", + &accuracy_log_sampling_target, nullptr); + if (lookupkv(model, scenario, "sample_concatenate_permutation", &val, + nullptr)) + sample_concatenate_permutation = (val == 1) ? true : false; + if (lookupkv(model, scenario, "test05", &val, nullptr)) + test05 = (val == 1) ? true : false; + lookupkv(model, scenario, "test05_qsl_rng_seed", &test05_qsl_rng_seed, + nullptr); + lookupkv(model, scenario, "test05_sample_index_rng_seed", + &test05_sample_index_rng_seed, nullptr); + lookupkv(model, scenario, "test05_schedule_rng_seed", + &test05_schedule_rng_seed, nullptr); + } + + // keys that can be overridden in user.conf but will make the results eligible + // only for open submission. Keys to measure token metrics + if (lookupkv(model, scenario, "use_token_latencies", &val, nullptr)) { use_token_latencies = (val == 1) ? true : false; } - if (use_token_latencies){ - lookupkv(model, "Server", "ttft_latency", &server_ttft_latency, nullptr, 1000 * 1000); - lookupkv(model, "Server", "tpot_latency", &server_tpot_latency, nullptr, 1000 * 1000); + if (use_token_latencies) { + lookupkv(model, "Server", "ttft_latency", &server_ttft_latency, nullptr, + 1000 * 1000); + lookupkv(model, "Server", "tpot_latency", &server_tpot_latency, nullptr, + 1000 * 1000); } // keys to infer token metrics - if (lookupkv(model, scenario, "infer_token_latencies", &val, nullptr)){ + if (lookupkv(model, scenario, "infer_token_latencies", &val, nullptr)) { infer_token_latencies = (val == 1) ? 
true : false; } - if (infer_token_latencies){ - lookupkv(model, scenario, "token_latency_scaling_factor", &token_latency_scaling_factor, nullptr, 1); + if (infer_token_latencies) { + lookupkv(model, scenario, "token_latency_scaling_factor", + &token_latency_scaling_factor, nullptr, 1); } // keys that apply to SingleStream lookupkv(model, "SingleStream", "target_latency_percentile", nullptr, &single_stream_target_latency_percentile, 0.01); - lookupkv(model, "SingleStream", "target_latency", nullptr, - &single_stream_expected_latency_ns, 1000 * 1000); // keys that apply to MultiStream lookupkv(model, "MultiStream", "target_latency_percentile", nullptr, &multi_stream_target_latency_percentile, 0.01); - lookupkv(model, "MultiStream", "target_latency", nullptr, - &multi_stream_expected_latency_ns, 1000 * 1000); lookupkv(model, "MultiStream", "samples_per_query", &multi_stream_samples_per_query, nullptr, 1); @@ -730,15 +740,37 @@ int TestSettings::FromConfig(const std::string &path, const std::string &model, &server_target_latency_percentile, 0.01); lookupkv(model, "Server", "target_latency", &server_target_latency_ns, nullptr, 1000 * 1000); - lookupkv(model, "Server", "target_qps", nullptr, &server_target_qps); + + // keys that can be overridden in user.conf (the provided values still need to + // pass the submission checker rules) + if (lookupkv(model, scenario, "performance_issue_unique", &val, nullptr)) + performance_issue_unique = (val == 0) ? false : true; + if (lookupkv(model, scenario, "performance_issue_same", &val, nullptr)) + performance_issue_same = (val == 0) ? false : true; + lookupkv(model, scenario, "performance_issue_same_index", + &performance_issue_same_index, nullptr); + + if (lookupkv(model, "Server", "coalesce_queries", &val, nullptr)) server_coalesce_queries = (val == 0) ? false : true; if (lookupkv(model, "Server", "max_async_queries", &val, nullptr)) server_max_async_queries = int(val); - // keys that apply to Offline + lookupkv(model, scenario, "min_duration", &min_duration_ms, nullptr); + lookupkv(model, scenario, "max_duration", &max_duration_ms, nullptr); + lookupkv(model, scenario, "min_query_count", &min_query_count, nullptr); + lookupkv(model, scenario, "max_query_count", &max_query_count, nullptr); + lookupkv(model, scenario, "performance_sample_count_override", + &performance_sample_count_override, nullptr); + lookupkv(model, "SingleStream", "target_latency", nullptr, + &single_stream_expected_latency_ns, 1000 * 1000); + lookupkv(model, "MultiStream", "target_latency", nullptr, + &multi_stream_expected_latency_ns, 1000 * 1000); + lookupkv(model, "Server", "target_qps", nullptr, &server_target_qps); lookupkv(model, "Offline", "target_qps", 0, &offline_expected_qps); + if (lookupkv(model, scenario, "print_timestamps", &val, nullptr)) + print_timestamps = (val == 0) ? false : true; + return 0; } diff --git a/mlperf.conf b/mlperf.conf deleted file mode 100644 index 10f7ae78f..000000000 --- a/mlperf.conf +++ /dev/null @@ -1,98 +0,0 @@ -# The format of this config file is 'key = value'. -# The key has the format 'model.scenario.key'. Value is mostly int64_t. -# Model maybe '*' as wildcard. In that case the value applies to all models. -# All times are in milli seconds - -# Set performance_sample_count for each model. -# User can optionally set this to higher values in user.conf. 
-resnet50.*.performance_sample_count_override = 1024 -ssd-mobilenet.*.performance_sample_count_override = 256 -retinanet.*.performance_sample_count_override = 64 -bert.*.performance_sample_count_override = 10833 -dlrm.*.performance_sample_count_override = 204800 -dlrm-v2.*.performance_sample_count_override = 204800 -rnnt.*.performance_sample_count_override = 2513 -gptj.*.performance_sample_count_override = 13368 -llama2-70b.*.performance_sample_count_override = 24576 -stable-diffusion-xl.*.performance_sample_count_override = 5000 -# set to 0 to let entire sample set to be performance sample -3d-unet.*.performance_sample_count_override = 0 - -# Set seeds. The seeds will be distributed two weeks before the submission. -*.*.qsl_rng_seed = 3066443479025735752 -*.*.sample_index_rng_seed = 10688027786191513374 -*.*.schedule_rng_seed = 14962580496156340209 -# Set seeds for TEST_05. The seeds will be distributed two weeks before the submission. -*.*.test05_qsl_rng_seed = 16799458546791641818 -*.*.test05_sample_index_rng_seed = 5453809927556429288 -*.*.test05_schedule_rng_seed = 5435552105434836064 - - -*.SingleStream.target_latency_percentile = 90 -*.SingleStream.min_duration = 600000 - -*.MultiStream.target_latency_percentile = 99 -*.MultiStream.samples_per_query = 8 -*.MultiStream.min_duration = 600000 -*.MultiStream.min_query_count = 662 -retinanet.MultiStream.target_latency = 528 - -# 3D-UNet uses equal issue mode because it has non-uniform inputs -3d-unet.*.sample_concatenate_permutation = 1 - -# LLM benchmarks have non-uniform inputs and outputs, and use equal issue mode for all latency scenario -gptj.*.sample_concatenate_permutation = 1 -llama2-70b.*.sample_concatenate_permutation = 1 -mixtral-8x7b.*.sample_concatenate_permutation = 1 - -*.Server.target_latency = 10 -*.Server.target_latency_percentile = 99 -*.Server.target_duration = 0 -*.Server.min_duration = 600000 -resnet50.Server.target_latency = 15 -retinanet.Server.target_latency = 100 -bert.Server.target_latency = 130 -dlrm.Server.target_latency = 60 -dlrm-v2.Server.target_latency = 60 -rnnt.Server.target_latency = 1000 -gptj.Server.target_latency = 20000 -stable-diffusion-xl.Server.target_latency = 20000 -# Llama2-70b benchmarks measures token latencies -llama2-70b.*.use_token_latencies = 1 -mixtral-8x7b.*.use_token_latencies = 1 -# gptj benchmark infers token latencies -gptj.*.infer_token_latencies = 1 -gptj.*.token_latency_scaling_factor = 69 -# Only ttft and tpot are tracked for the llama2-70b & mixtral-8x7B benchmark therefore target_latency = 0 -llama2-70b.Server.target_latency = 0 -llama2-70b.Server.ttft_latency = 2000 -llama2-70b.Server.tpot_latency = 200 - -mixtral-8x7b.Server.target_latency = 0 -mixtral-8x7b.Server.ttft_latency = 2000 -mixtral-8x7b.Server.tpot_latency = 200 - -*.Offline.target_latency_percentile = 90 -*.Offline.min_duration = 600000 - -# In Offline scenario, we always have one query. But LoadGen maps this to -# min_sample_count internally in Offline scenario. 
If the dataset size is larger -# than 24576 we limit the min_query_count to 24576 and otherwise we use -# the dataset size as the limit - -resnet50.Offline.min_query_count = 24576 -retinanet.Offline.min_query_count = 24576 -dlrm-v2.Offline.min_query_count = 24576 -bert.Offline.min_query_count = 10833 -gptj.Offline.min_query_count = 13368 -rnnt.Offline.min_query_count = 2513 -3d-unet.Offline.min_query_count = 43 -stable-diffusion-xl.Offline.min_query_count = 5000 -llama2-70b.Offline.min_query_count = 24576 -mixtral-8x7b.Offline.min_query_count = 15000 - -# These fields should be defined and overridden by user.conf. -*.SingleStream.target_latency = 10 -*.MultiStream.target_latency = 80 -*.Server.target_qps = 1.0 -*.Offline.target_qps = 1.0 diff --git a/mlperf.conf b/mlperf.conf new file mode 120000 index 000000000..58fc80e84 --- /dev/null +++ b/mlperf.conf @@ -0,0 +1 @@ +loadgen/mlperf.conf \ No newline at end of file diff --git a/recommendation/dlrm_v2/pytorch/python/main.py b/recommendation/dlrm_v2/pytorch/python/main.py index 53b8d9237..d931c661a 100755 --- a/recommendation/dlrm_v2/pytorch/python/main.py +++ b/recommendation/dlrm_v2/pytorch/python/main.py @@ -36,13 +36,13 @@ SUPPORTED_DATASETS = { "debug": (multihot_criteo.MultihotCriteo, multihot_criteo.pre_process_criteo_dlrm, multihot_criteo.DlrmPostProcess(), - {"randomize": 'total', "memory_map": True}), + {"randomize": 'total', "memory_map": True}), "multihot-criteo-sample": (multihot_criteo.MultihotCriteo, multihot_criteo.pre_process_criteo_dlrm, multihot_criteo.DlrmPostProcess(), - {"randomize": 'total', "memory_map": True}), + {"randomize": 'total', "memory_map": True}), "multihot-criteo": (multihot_criteo.MultihotCriteo, multihot_criteo.pre_process_criteo_dlrm, multihot_criteo.DlrmPostProcess(), - {"randomize": 'total', "memory_map": True}), + {"randomize": 'total', "memory_map": True}), } # pre-defined command line options so simplify things. They are used as defaults and can be @@ -97,42 +97,104 @@ def get_args(): """Parse commandline.""" parser = argparse.ArgumentParser() parser.add_argument("--model", help="name of the mlperf model, ie. 
dlrm") - parser.add_argument("--model-path", required=True, help="path to the model file") - parser.add_argument("--dataset", choices=SUPPORTED_DATASETS.keys(), help="dataset") - parser.add_argument("--dataset-path", required=True, help="path to the dataset") - parser.add_argument("--profile", choices=SUPPORTED_PROFILES.keys(), help="standard profiles") + parser.add_argument( + "--model-path", + required=True, + help="path to the model file") + parser.add_argument( + "--dataset", + choices=SUPPORTED_DATASETS.keys(), + help="dataset") + parser.add_argument( + "--dataset-path", + required=True, + help="path to the dataset") + parser.add_argument( + "--profile", + choices=SUPPORTED_PROFILES.keys(), + help="standard profiles") parser.add_argument("--scenario", default="SingleStream", help="mlperf benchmark scenario, one of " + str(list(SCENARIO_MAP.keys()))) parser.add_argument("--max-ind-range", type=int, default=-1) - parser.add_argument("--max-batchsize", type=int, help="max batch size in a single inference") + parser.add_argument( + "--max-batchsize", + type=int, + help="max batch size in a single inference") parser.add_argument("--output", help="test results") parser.add_argument("--inputs", help="model inputs (currently not used)") parser.add_argument("--outputs", help="model outputs (currently not used)") parser.add_argument("--backend", help="runtime to use") parser.add_argument("--use-gpu", action="store_true", default=False) - parser.add_argument("--threads", default=os.cpu_count(), type=int, help="threads") - parser.add_argument("--accuracy", action="store_true", help="enable accuracy pass") - parser.add_argument("--find-peak-performance", action="store_true", help="enable finding peak performance pass") + parser.add_argument( + "--threads", + default=os.cpu_count(), + type=int, + help="threads") + parser.add_argument( + "--accuracy", + action="store_true", + help="enable accuracy pass") + parser.add_argument( + "--find-peak-performance", + action="store_true", + help="enable finding peak performance pass") - # file to use mlperf rules compliant parameters - parser.add_argument("--mlperf_conf", default="mlperf.conf", help="mlperf rules config") # file for user LoadGen settings such as target QPS - parser.add_argument("--user_conf", default="user.conf", help="user config for user LoadGen settings such as target QPS") - - # below will override mlperf rules compliant settings - don't use for official submission - parser.add_argument("--duration", type=int, help="duration in milliseconds (ms)") + parser.add_argument( + "--user_conf", + default="user.conf", + help="user config for user LoadGen settings such as target QPS") + + # below will override mlperf rules compliant settings - don't use for + # official submission + parser.add_argument( + "--duration", + type=int, + help="duration in milliseconds (ms)") parser.add_argument("--target-qps", type=int, help="target/expected qps") - parser.add_argument("--max-latency", type=float, help="mlperf max latency in pct tile") - parser.add_argument("--count-samples", type=int, help="dataset items to use") - parser.add_argument("--count-queries", type=int, help="number of queries to use") - parser.add_argument("--samples-per-query-multistream", default=8, type=int, help="query length for multi-stream scenario (in terms of aggregated samples)") + parser.add_argument( + "--max-latency", + type=float, + help="mlperf max latency in pct tile") + parser.add_argument( + "--count-samples", + type=int, + help="dataset items to use") + parser.add_argument( 
+ "--count-queries", + type=int, + help="number of queries to use") + parser.add_argument( + "--samples-per-query-multistream", + default=8, + type=int, + help="query length for multi-stream scenario (in terms of aggregated samples)") # --samples-per-query-offline is equivalent to perf_sample_count - parser.add_argument("--samples-per-query-offline", type=int, default=2048, help="query length for offline scenario (in terms of aggregated samples)") - parser.add_argument("--samples-to-aggregate-fix", type=int, help="number of samples to be treated as one") - parser.add_argument("--samples-to-aggregate-min", type=int, help="min number of samples to be treated as one in random query size") - parser.add_argument("--samples-to-aggregate-max", type=int, help="max number of samples to be treated as one in random query size") - parser.add_argument("--samples-to-aggregate-quantile-file", type=str, help="distribution quantile used to generate number of samples to be treated as one in random query size") - parser.add_argument("--samples-to-aggregate-trace-file", type=str, default="dlrm_trace_of_aggregated_samples.txt") + parser.add_argument( + "--samples-per-query-offline", + type=int, + default=2048, + help="query length for offline scenario (in terms of aggregated samples)") + parser.add_argument( + "--samples-to-aggregate-fix", + type=int, + help="number of samples to be treated as one") + parser.add_argument( + "--samples-to-aggregate-min", + type=int, + help="min number of samples to be treated as one in random query size") + parser.add_argument( + "--samples-to-aggregate-max", + type=int, + help="max number of samples to be treated as one in random query size") + parser.add_argument( + "--samples-to-aggregate-quantile-file", + type=str, + help="distribution quantile used to generate number of samples to be treated as one in random query size") + parser.add_argument( + "--samples-to-aggregate-trace-file", + type=str, + default="dlrm_trace_of_aggregated_samples.txt") parser.add_argument("--numpy-rand-seed", type=int, default=123) parser.add_argument("--debug", action="store_true", default=False) args = parser.parse_args() @@ -170,7 +232,7 @@ def get_backend(backend, dataset, use_gpu, debug): if dataset == "debug": # 1. Syntetic debug dataset backend = BackendDistPytorchNative( - num_embeddings_per_feature = [2 for _ in range(26)], + num_embeddings_per_feature=[2 for _ in range(26)], embedding_dim=128, dcn_num_layers=3, dcn_low_rank_dim=512, @@ -182,7 +244,33 @@ def get_backend(backend, dataset, use_gpu, debug): elif dataset == "multihot-criteo-sample": # 2. Syntetic multihot criteo sample backend = BackendDistPytorchNative( - num_embeddings_per_feature = [40000000,39060,17295,7424,20265,3,7122,1543,63,40000000,3067956,405282,10,2209,11938,155,4,976,14,40000000,40000000,40000000,590152,12973,108,36], + num_embeddings_per_feature=[ + 40000000, + 39060, + 17295, + 7424, + 20265, + 3, + 7122, + 1543, + 63, + 40000000, + 3067956, + 405282, + 10, + 2209, + 11938, + 155, + 4, + 976, + 14, + 40000000, + 40000000, + 40000000, + 590152, + 12973, + 108, + 36], embedding_dim=128, dcn_num_layers=3, dcn_low_rank_dim=512, @@ -194,7 +282,33 @@ def get_backend(backend, dataset, use_gpu, debug): elif dataset == "multihot-criteo": # 3. 
Syntetic multihot criteo backend = BackendDistPytorchNative( - num_embeddings_per_feature = [40000000,39060,17295,7424,20265,3,7122,1543,63,40000000,3067956,405282,10,2209,11938,155,4,976,14,40000000,40000000,40000000,590152,12973,108,36], + num_embeddings_per_feature=[ + 40000000, + 39060, + 17295, + 7424, + 20265, + 3, + 7122, + 1543, + 63, + 40000000, + 3067956, + 405282, + 10, + 2209, + 11938, + 155, + 4, + 976, + 14, + 40000000, + 40000000, + 40000000, + 590152, + 12973, + 108, + 36], embedding_dim=128, dcn_num_layers=3, dcn_low_rank_dim=512, @@ -204,12 +318,13 @@ def get_backend(backend, dataset, use_gpu, debug): debug=debug ) else: - raise ValueError("only debug|multihot-criteo-sample|multihot-criteo dataset options are supported") + raise ValueError( + "only debug|multihot-criteo-sample|multihot-criteo dataset options are supported") else: if dataset == "debug": # 1. Syntetic debug dataset backend = BackendPytorchNative( - num_embeddings_per_feature = [2 for _ in range(26)], + num_embeddings_per_feature=[2 for _ in range(26)], embedding_dim=128, dcn_num_layers=3, dcn_low_rank_dim=512, @@ -221,7 +336,33 @@ def get_backend(backend, dataset, use_gpu, debug): elif dataset == "multihot-criteo-sample": # 2. Syntetic multihot criteo sample backend = BackendPytorchNative( - num_embeddings_per_feature = [40000000,39060,17295,7424,20265,3,7122,1543,63,40000000,3067956,405282,10,2209,11938,155,4,976,14,40000000,40000000,40000000,590152,12973,108,36], + num_embeddings_per_feature=[ + 40000000, + 39060, + 17295, + 7424, + 20265, + 3, + 7122, + 1543, + 63, + 40000000, + 3067956, + 405282, + 10, + 2209, + 11938, + 155, + 4, + 976, + 14, + 40000000, + 40000000, + 40000000, + 590152, + 12973, + 108, + 36], embedding_dim=128, dcn_num_layers=3, dcn_low_rank_dim=512, @@ -233,7 +374,33 @@ def get_backend(backend, dataset, use_gpu, debug): elif dataset == "multihot-criteo": # 3. 
Syntetic multihot criteo backend = BackendPytorchNative( - num_embeddings_per_feature = [40000000,39060,17295,7424,20265,3,7122,1543,63,40000000,3067956,405282,10,2209,11938,155,4,976,14,40000000,40000000,40000000,590152,12973,108,36], + num_embeddings_per_feature=[ + 40000000, + 39060, + 17295, + 7424, + 20265, + 3, + 7122, + 1543, + 63, + 40000000, + 3067956, + 405282, + 10, + 2209, + 11938, + 155, + 4, + 976, + 14, + 40000000, + 40000000, + 40000000, + 590152, + 12973, + 108, + 36], embedding_dim=128, dcn_num_layers=3, dcn_low_rank_dim=512, @@ -243,7 +410,8 @@ def get_backend(backend, dataset, use_gpu, debug): debug=debug ) else: - raise ValueError("only debug|multihot-criteo-sample|multihot-criteo dataset options are supported") + raise ValueError( + "only debug|multihot-criteo-sample|multihot-criteo dataset options are supported") else: raise ValueError("unknown backend: " + backend) @@ -253,7 +421,8 @@ def get_backend(backend, dataset, use_gpu, debug): class Item: """An item that we queue for processing by the thread pool.""" - def __init__(self, query_id, content_id, features, batch_T=None, idx_offsets = None): + def __init__(self, query_id, content_id, features, + batch_T=None, idx_offsets=None): self.query_id = query_id self.content_id = content_id self.features = features @@ -261,6 +430,7 @@ def __init__(self, query_id, content_id, features, batch_T=None, idx_offsets = N self.idx_offsets = idx_offsets self.start = time.time() + class RunnerBase: def __init__(self, model, ds, threads, post_proc=None, max_batchsize=128): self.take_accuracy = False @@ -285,7 +455,8 @@ def run_one_item(self, qitem): processed_results = [] try: results = self.model.predict(qitem.features, qitem.content_id) - processed_results = self.post_process(results, qitem.batch_T, self.result_dict) + processed_results = self.post_process( + results, qitem.batch_T, self.result_dict) if self.take_accuracy: self.post_process.add_results(processed_results) self.result_timing.append(time.time() - qitem.start) @@ -305,7 +476,8 @@ def run_one_item(self, qitem): # print("s,e:",s_idx,e_idx, len(processed_results)) s_idx = qitem.idx_offsets[idx] e_idx = qitem.idx_offsets[idx + 1] - response_array = array.array("B", np.array(processed_results[s_idx:e_idx], np.float32).tobytes()) + response_array = array.array("B", np.array( + processed_results[s_idx:e_idx], np.float32).tobytes()) response_array_refs.append(response_array) bi = response_array.buffer_info() response.append(lg.QuerySampleResponse(query_id, bi[0], bi[1])) @@ -314,20 +486,27 @@ def run_one_item(self, qitem): def enqueue(self, query_samples): idx = [q.index for q in query_samples] query_id = [q.id for q in query_samples] - #print(idx) + # print(idx) query_len = len(query_samples) if query_len < self.max_batchsize: samples, idx_offsets = self.ds.get_samples(idx) batch_T = [self.ds.get_labels(sample) for sample in samples] - self.run_one_item(Item(query_id, idx, samples, batch_T, idx_offsets)) + self.run_one_item( + Item( + query_id, + idx, + samples, + batch_T, + idx_offsets)) else: bs = self.max_batchsize for i in range(0, query_len, bs): ie = min(i + bs, query_len) samples, idx_offsets = self.ds.get_samples(idx[i:ie]) batch_T = [self.ds.get_labels(sample) for sample in samples] - self.run_one_item(Item(query_id[i:ie], idx[i:ie], samples, batch_T, idx_offsets)) + self.run_one_item( + Item(query_id[i:ie], idx[i:ie], samples, batch_T, idx_offsets)) def finish(self): pass @@ -336,13 +515,16 @@ def finish(self): class QueueRunner(RunnerBase): def __init__(self, 
model, ds, threads, post_proc=None, max_batchsize=128): super().__init__(model, ds, threads, post_proc, max_batchsize) - queue_size_multiplier = 4 #(args.samples_per_query_offline + max_batchsize - 1) // max_batchsize) + # (args.samples_per_query_offline + max_batchsize - 1) // max_batchsize) + queue_size_multiplier = 4 self.tasks = JoinableQueue(maxsize=threads * queue_size_multiplier) self.workers = [] self.result_dict = {} for _ in range(self.threads): - worker = threading.Thread(target=self.handle_tasks, args=(self.tasks,)) + worker = threading.Thread( + target=self.handle_tasks, args=( + self.tasks,)) worker.daemon = True self.workers.append(worker) worker.start() @@ -362,7 +544,7 @@ def enqueue(self, query_samples): idx = [q.index for q in query_samples] query_id = [q.id for q in query_samples] query_len = len(query_samples) - #print(idx) + # print(idx) if query_len < self.max_batchsize: samples, idx_offsets = self.ds.get_samples(idx) batch_T = [self.ds.get_labels(sample) for sample in samples] @@ -373,7 +555,8 @@ def enqueue(self, query_samples): ie = min(i + bs, query_len) samples, idx_offsets = self.ds.get_samples(idx) batch_T = [self.ds.get_labels(sample) for sample in samples] - self.tasks.put(Item(query_id[i:ie], idx[i:ie], samples, batch_T, idx_offsets)) + self.tasks.put( + Item(query_id[i:ie], idx[i:ie], samples, batch_T, idx_offsets)) def finish(self): # exit all threads @@ -383,11 +566,12 @@ def finish(self): worker.join() - -def add_results(final_results, name, result_dict, result_list, took, show_accuracy=False): +def add_results(final_results, name, result_dict, + result_list, took, show_accuracy=False): percentiles = [50., 80., 90., 95., 99., 99.9] buckets = np.percentile(result_list, percentiles).tolist() - buckets_str = ",".join(["{}:{:.4f}".format(p, b) for p, b in zip(percentiles, buckets)]) + buckets_str = ",".join(["{}:{:.4f}".format(p, b) + for p, b in zip(percentiles, buckets)]) if result_dict["total"] == 0: result_dict["total"] = len(result_list) @@ -426,13 +610,17 @@ def main(): log.info(args) # find backend - backend = get_backend(args.backend, args.dataset, args.use_gpu, debug=args.debug) + backend = get_backend( + args.backend, + args.dataset, + args.use_gpu, + debug=args.debug) # dataset to use wanted_dataset, pre_proc, post_proc, kwargs = SUPPORTED_DATASETS[args.dataset] # --count-samples can be used to limit the number of samples used for testing - ds = wanted_dataset(num_embeddings_per_feature=[40000000,39060,17295,7424,20265,3,7122,1543,63,40000000,3067956,405282,10,2209,11938,155,4,976,14,40000000,40000000,40000000,590152,12973,108,3], + ds = wanted_dataset(num_embeddings_per_feature=[40000000, 39060, 17295, 7424, 20265, 3, 7122, 1543, 63, 40000000, 3067956, 405282, 10, 2209, 11938, 155, 4, 976, 14, 40000000, 40000000, 40000000, 590152, 12973, 108, 3], data_path=args.dataset_path, name=args.dataset, pre_process=pre_proc, # currently an identity function @@ -445,7 +633,10 @@ def main(): max_ind_range=args.max_ind_range, **kwargs) # load model to backend - model = backend.load(args.model_path, inputs=args.inputs, outputs=args.outputs) + model = backend.load( + args.model_path, + inputs=args.inputs, + outputs=args.outputs) final_results = { "runtime": model.name(), "version": model.version(), @@ -453,11 +644,6 @@ def main(): "cmdline": str(args), } - mlperf_conf = os.path.abspath(args.mlperf_conf) - if not os.path.exists(mlperf_conf): - log.error("{} not found".format(mlperf_conf)) - sys.exit(1) - user_conf = os.path.abspath(args.user_conf) if not 
os.path.exists(user_conf): log.error("{} not found".format(user_conf)) @@ -489,7 +675,12 @@ def main(): lg.TestScenario.Offline: QueueRunner } - runner = runner_map[scenario](model, ds, args.threads, post_proc=post_proc, max_batchsize=args.max_batchsize) + runner = runner_map[scenario]( + model, + ds, + args.threads, + post_proc=post_proc, + max_batchsize=args.max_batchsize) def issue_queries(query_samples): runner.enqueue(query_samples) @@ -498,7 +689,8 @@ def flush_queries(): pass settings = lg.TestSettings() - settings.FromConfig(mlperf_conf, args.model_path, args.scenario) + # mlperf_conf is automatically loaded by the loadgen + # settings.FromConfig(mlperf_conf, args.model_path, args.scenario) settings.FromConfig(user_conf, args.model_path, args.scenario) settings.scenario = scenario settings.mode = lg.TestMode.PerformanceOnly @@ -526,13 +718,22 @@ def flush_queries(): if args.max_latency: settings.server_target_latency_ns = int(args.max_latency * NANO_SEC) - settings.multi_stream_expected_latency_ns = int(args.max_latency * NANO_SEC) + settings.multi_stream_expected_latency_ns = int( + args.max_latency * NANO_SEC) sut = lg.ConstructSUT(issue_queries, flush_queries) - qsl = lg.ConstructQSL(count, min(count, args.samples_per_query_offline), ds.load_query_samples, ds.unload_query_samples) + qsl = lg.ConstructQSL(count, + min(count, + args.samples_per_query_offline), + ds.load_query_samples, + ds.unload_query_samples) log.info("starting {}".format(scenario)) - result_dict = {"good": 0, "total": 0, "roc_auc": 0, "scenario": str(scenario)} + result_dict = { + "good": 0, + "total": 0, + "roc_auc": 0, + "scenario": str(scenario)} runner.start_run(result_dict, args.accuracy) lg.StartTest(sut, qsl, settings) diff --git a/text_to_image/main.py b/text_to_image/main.py index 07cf17472..57bfb56dd 100644 --- a/text_to_image/main.py +++ b/text_to_image/main.py @@ -73,15 +73,22 @@ def get_args(): parser = argparse.ArgumentParser() - parser.add_argument("--dataset", choices=SUPPORTED_DATASETS.keys(), help="dataset") - parser.add_argument("--dataset-path", required=True, help="path to the dataset") + parser.add_argument( + "--dataset", + choices=SUPPORTED_DATASETS.keys(), + help="dataset") + parser.add_argument( + "--dataset-path", + required=True, + help="path to the dataset") parser.add_argument( "--profile", choices=SUPPORTED_PROFILES.keys(), help="standard profiles" ) parser.add_argument( "--scenario", default="SingleStream", - help="mlperf benchmark scenario, one of " + str(list(SCENARIO_MAP.keys())), + help="mlperf benchmark scenario, one of " + + str(list(SCENARIO_MAP.keys())), ) parser.add_argument( "--max-batchsize", @@ -90,7 +97,10 @@ def get_args(): help="max batch size in a single inference", ) parser.add_argument("--threads", default=1, type=int, help="threads") - parser.add_argument("--accuracy", action="store_true", help="enable accuracy pass") + parser.add_argument( + "--accuracy", + action="store_true", + help="enable accuracy pass") parser.add_argument( "--find-peak-performance", action="store_true", @@ -121,10 +131,6 @@ def get_args(): help="framework to load the latents", ) - # file to use mlperf rules compliant parameters - parser.add_argument( - "--mlperf_conf", default="mlperf.conf", help="mlperf rules config" - ) # file for user LoadGen settings such as target QPS parser.add_argument( "--user_conf", @@ -139,9 +145,13 @@ def get_args(): # pass this argument for official submission # parser.add_argument("--output-images", action="store_true", help="Store a subset of the generated 
images") # do not modify this argument for official submission - parser.add_argument("--ids-path", help="Path to caption ids", default="tools/sample_ids.txt") + parser.add_argument( + "--ids-path", + help="Path to caption ids", + default="tools/sample_ids.txt") - # below will override mlperf rules compliant settings - don't use for official submission + # below will override mlperf rules compliant settings - don't use for + # official submission parser.add_argument("--time", type=int, help="time to scan in seconds") parser.add_argument("--count", type=int, help="dataset items to use") parser.add_argument("--debug", action="store_true", help="debug") @@ -259,9 +269,9 @@ def enqueue(self, query_samples): else: bs = self.max_batchsize for i in range(0, len(idx), bs): - data, label = self.ds.get_samples(idx[i : i + bs]) + data, label = self.ds.get_samples(idx[i: i + bs]) self.run_one_item( - Item(query_id[i : i + bs], idx[i : i + bs], data, label) + Item(query_id[i: i + bs], idx[i: i + bs], data, label) ) def finish(self): @@ -276,7 +286,9 @@ def __init__(self, model, ds, threads, post_proc=None, max_batchsize=128): self.result_dict = {} for _ in range(self.threads): - worker = threading.Thread(target=self.handle_tasks, args=(self.tasks,)) + worker = threading.Thread( + target=self.handle_tasks, args=( + self.tasks,)) worker.daemon = True self.workers.append(worker) worker.start() @@ -366,18 +378,13 @@ def main(): "cmdline": str(args), } - mlperf_conf = os.path.abspath(args.mlperf_conf) - if not os.path.exists(mlperf_conf): - log.error("{} not found".format(mlperf_conf)) - sys.exit(1) - user_conf = os.path.abspath(args.user_conf) if not os.path.exists(user_conf): log.error("{} not found".format(user_conf)) sys.exit(1) audit_config = os.path.abspath(args.audit_conf) - + if args.accuracy: ids_path = os.path.abspath(args.ids_path) with open(ids_path) as f: @@ -432,7 +439,8 @@ def flush_queries(): log_settings.log_output = log_output_settings settings = lg.TestSettings() - settings.FromConfig(mlperf_conf, args.model_name, args.scenario) + # mlperf.conf is automatically loaded by the loadgen + # settings.FromConfig(mlperf_conf, args.model_name, args.scenario) settings.FromConfig(user_conf, args.model_name, args.scenario) if os.path.exists(audit_config): settings.FromConfig(audit_config, args.model_name, args.scenario) @@ -461,7 +469,8 @@ def flush_queries(): settings.multi_stream_samples_per_query = args.samples_per_query if args.max_latency: settings.server_target_latency_ns = int(args.max_latency * NANO_SEC) - settings.multi_stream_expected_latency_ns = int(args.max_latency * NANO_SEC) + settings.multi_stream_expected_latency_ns = int( + args.max_latency * NANO_SEC) performance_sample_count = ( args.performance_sample_count diff --git a/vision/classification_and_detection/python/main.py b/vision/classification_and_detection/python/main.py index d6f16467a..a41e0c1df 100755 --- a/vision/classification_and_detection/python/main.py +++ b/vision/classification_and_detection/python/main.py @@ -51,20 +51,20 @@ (coco.Coco, dataset.pre_process_coco_mobilenet, coco.PostProcessCoco(), {"image_size": [300, 300, 3]}), "coco-300-pt": - (coco.Coco, dataset.pre_process_coco_pt_mobilenet, coco.PostProcessCocoPt(False,0.3), + (coco.Coco, dataset.pre_process_coco_pt_mobilenet, coco.PostProcessCocoPt(False, 0.3), {"image_size": [300, 300, 3]}), "openimages-300-retinanet": - (openimages.OpenImages, dataset.pre_process_openimages_retinanet, openimages.PostProcessOpenImagesRetinanet(False,0.05,300,300), - 
{"image_size": [300, 300, 3]}), + (openimages.OpenImages, dataset.pre_process_openimages_retinanet, openimages.PostProcessOpenImagesRetinanet(False, 0.05, 300, 300), + {"image_size": [300, 300, 3]}), "openimages-800-retinanet": - (openimages.OpenImages, dataset.pre_process_openimages_retinanet, openimages.PostProcessOpenImagesRetinanet(False,0.05,800,800), - {"image_size": [800, 800, 3]}), + (openimages.OpenImages, dataset.pre_process_openimages_retinanet, openimages.PostProcessOpenImagesRetinanet(False, 0.05, 800, 800), + {"image_size": [800, 800, 3]}), "openimages-1200-retinanet": - (openimages.OpenImages, dataset.pre_process_openimages_retinanet, openimages.PostProcessOpenImagesRetinanet(False,0.05,1200,1200), - {"image_size": [1200, 1200, 3]}), + (openimages.OpenImages, dataset.pre_process_openimages_retinanet, openimages.PostProcessOpenImagesRetinanet(False, 0.05, 1200, 1200), + {"image_size": [1200, 1200, 3]}), "openimages-800-retinanet-onnx": - (openimages.OpenImages, dataset.pre_process_openimages_retinanet, openimages.PostProcessOpenImagesRetinanet(False,0.05,800,800,False), - {"image_size": [800, 800, 3]}), + (openimages.OpenImages, dataset.pre_process_openimages_retinanet, openimages.PostProcessOpenImagesRetinanet(False, 0.05, 800, 800, False), + {"image_size": [800, 800, 3]}), "coco-1200": (coco.Coco, dataset.pre_process_coco_resnet34, coco.PostProcessCoco(), {"image_size": [1200, 1200, 3]}), @@ -72,11 +72,11 @@ (coco.Coco, dataset.pre_process_coco_resnet34, coco.PostProcessCocoOnnx(), {"image_size": [1200, 1200, 3]}), "coco-1200-pt": - (coco.Coco, dataset.pre_process_coco_resnet34, coco.PostProcessCocoPt(True,0.05), - {"image_size": [1200, 1200, 3],"use_label_map": True}), + (coco.Coco, dataset.pre_process_coco_resnet34, coco.PostProcessCocoPt(True, 0.05), + {"image_size": [1200, 1200, 3], "use_label_map": True}), "coco-1200-tf": (coco.Coco, dataset.pre_process_coco_resnet34, coco.PostProcessCocoTf(), - {"image_size": [1200, 1200, 3],"use_label_map": False}), + {"image_size": [1200, 1200, 3], "use_label_map": False}), } # pre-defined command line options so simplify things. 
They are used as defaults and can be @@ -227,44 +227,102 @@ def get_args(): """Parse commandline.""" parser = argparse.ArgumentParser() - parser.add_argument("--dataset", choices=SUPPORTED_DATASETS.keys(), help="dataset") - parser.add_argument("--dataset-path", required=True, help="path to the dataset") + parser.add_argument( + "--dataset", + choices=SUPPORTED_DATASETS.keys(), + help="dataset") + parser.add_argument( + "--dataset-path", + required=True, + help="path to the dataset") parser.add_argument("--dataset-list", help="path to the dataset list") - parser.add_argument("--data-format", choices=["NCHW", "NHWC"], help="data format") - parser.add_argument("--profile", choices=SUPPORTED_PROFILES.keys(), help="standard profiles") + parser.add_argument( + "--data-format", + choices=[ + "NCHW", + "NHWC"], + help="data format") + parser.add_argument( + "--profile", + choices=SUPPORTED_PROFILES.keys(), + help="standard profiles") parser.add_argument("--scenario", default="SingleStream", help="mlperf benchmark scenario, one of " + str(list(SCENARIO_MAP.keys()))) - parser.add_argument("--max-batchsize", type=int, help="max batch size in a single inference") + parser.add_argument( + "--max-batchsize", + type=int, + help="max batch size in a single inference") parser.add_argument("--model", required=True, help="model file") parser.add_argument("--output", default="output", help="test results") parser.add_argument("--inputs", help="model inputs") parser.add_argument("--outputs", help="model outputs") parser.add_argument("--backend", help="runtime to use") parser.add_argument("--device", help="device to use") - parser.add_argument("--model-name", help="name of the mlperf model, ie. resnet50") - parser.add_argument("--threads", default=os.cpu_count(), type=int, help="threads") + parser.add_argument( + "--model-name", + help="name of the mlperf model, ie. 
resnet50") + parser.add_argument( + "--threads", + default=os.cpu_count(), + type=int, + help="threads") parser.add_argument("--qps", type=int, help="target qps") parser.add_argument("--cache", type=int, default=0, help="use cache") - parser.add_argument("--cache_dir", type=str, default=None, help="dir path for caching") - parser.add_argument("--preprocessed_dir", type=str, default=None, help="dir path for storing preprocessed images (overrides cache_dir)") - parser.add_argument("--use_preprocessed_dataset", action="store_true", help="use preprocessed dataset instead of the original") - parser.add_argument("--accuracy", action="store_true", help="enable accuracy pass") - parser.add_argument("--find-peak-performance", action="store_true", help="enable finding peak performance pass") - parser.add_argument("--debug", action="store_true", help="debug, turn traces on") - - # file to use mlperf rules compliant parameters - parser.add_argument("--mlperf_conf", default="../../mlperf.conf", help="mlperf rules config") + parser.add_argument( + "--cache_dir", + type=str, + default=None, + help="dir path for caching") + parser.add_argument( + "--preprocessed_dir", + type=str, + default=None, + help="dir path for storing preprocessed images (overrides cache_dir)") + parser.add_argument( + "--use_preprocessed_dataset", + action="store_true", + help="use preprocessed dataset instead of the original") + parser.add_argument( + "--accuracy", + action="store_true", + help="enable accuracy pass") + parser.add_argument( + "--find-peak-performance", + action="store_true", + help="enable finding peak performance pass") + parser.add_argument( + "--debug", + action="store_true", + help="debug, turn traces on") + # file for user LoadGen settings such as target QPS - parser.add_argument("--user_conf", default="user.conf", help="user config for user LoadGen settings such as target QPS") + parser.add_argument( + "--user_conf", + default="user.conf", + help="user config for user LoadGen settings such as target QPS") # file for LoadGen audit settings - parser.add_argument("--audit_conf", default="audit.config", help="config for LoadGen audit settings") + parser.add_argument( + "--audit_conf", + default="audit.config", + help="config for LoadGen audit settings") - # below will override mlperf rules compliant settings - don't use for official submission + # below will override mlperf rules compliant settings - don't use for + # official submission parser.add_argument("--time", type=int, help="time to scan in seconds") parser.add_argument("--count", type=int, help="dataset items to use") - parser.add_argument("--performance-sample-count", type=int, help="performance sample count") - parser.add_argument("--max-latency", type=float, help="mlperf max latency in pct tile") - parser.add_argument("--samples-per-query", default=8, type=int, help="mlperf multi-stream samples per query") + parser.add_argument( + "--performance-sample-count", + type=int, + help="performance sample count") + parser.add_argument( + "--max-latency", + type=float, + help="mlperf max latency in pct tile") + parser.add_argument( + "--samples-per-query", + default=8, + type=int, + help="mlperf multi-stream samples per query") args = parser.parse_args() # don't use defaults in argparser. 
Instead we default to a dict, override that with a profile @@ -306,7 +364,7 @@ def get_backend(backend): backend = BackendPytorch() elif backend == "pytorch-native": from backend_pytorch_native import BackendPytorchNative - backend = BackendPytorchNative() + backend = BackendPytorchNative() elif backend == "tflite": from backend_tflite import BackendTflite backend = BackendTflite() @@ -354,7 +412,8 @@ def run_one_item(self, qitem): processed_results = [] try: results = self.model.predict({self.model.inputs[0]: qitem.img}) - processed_results = self.post_process(results, qitem.content_id, qitem.label, self.result_dict) + processed_results = self.post_process( + results, qitem.content_id, qitem.label, self.result_dict) if self.take_accuracy: self.post_process.add_results(processed_results) self.result_timing.append(time.time() - qitem.start) @@ -367,7 +426,8 @@ def run_one_item(self, qitem): response_array_refs = [] response = [] for idx, query_id in enumerate(qitem.query_id): - response_array = array.array("B", np.array(processed_results[idx], np.float32).tobytes()) + response_array = array.array("B", np.array( + processed_results[idx], np.float32).tobytes()) response_array_refs.append(response_array) bi = response_array.buffer_info() response.append(lg.QuerySampleResponse(query_id, bi[0], bi[1])) @@ -382,8 +442,9 @@ def enqueue(self, query_samples): else: bs = self.max_batchsize for i in range(0, len(idx), bs): - data, label = self.ds.get_samples(idx[i:i+bs]) - self.run_one_item(Item(query_id[i:i+bs], idx[i:i+bs], data, label)) + data, label = self.ds.get_samples(idx[i:i + bs]) + self.run_one_item( + Item(query_id[i:i + bs], idx[i:i + bs], data, label)) def finish(self): pass @@ -397,7 +458,9 @@ def __init__(self, model, ds, threads, post_proc=None, max_batchsize=128): self.result_dict = {} for _ in range(self.threads): - worker = threading.Thread(target=self.handle_tasks, args=(self.tasks,)) + worker = threading.Thread( + target=self.handle_tasks, args=( + self.tasks,)) worker.daemon = True self.workers.append(worker) worker.start() @@ -434,10 +497,12 @@ def finish(self): worker.join() -def add_results(final_results, name, result_dict, result_list, took, show_accuracy=False): +def add_results(final_results, name, result_dict, + result_list, took, show_accuracy=False): percentiles = [50., 80., 90., 95., 99., 99.9] buckets = np.percentile(result_list, percentiles).tolist() - buckets_str = ",".join(["{}:{:.4f}".format(p, b) for p, b in zip(percentiles, buckets)]) + buckets_str = ",".join(["{}:{:.4f}".format(p, b) + for p, b in zip(percentiles, buckets)]) if result_dict["total"] == 0: result_dict["total"] = len(result_list) @@ -478,7 +543,7 @@ def main(): # find backend backend = get_backend(args.backend) - # If TVM, pass max_batchsize to the backend + # If TVM, pass max_batchsize to the backend if args.backend.startswith('tvm'): backend.max_batchsize = args.max_batchsize backend.arena_num = args.threads @@ -497,7 +562,7 @@ def main(): # dataset to use wanted_dataset, pre_proc, post_proc, kwargs = SUPPORTED_DATASETS[args.dataset] if args.use_preprocessed_dataset: - pre_proc=None + pre_proc = None ds = wanted_dataset(data_path=args.dataset_path, image_list=args.dataset_list, name=args.dataset, @@ -511,9 +576,16 @@ def main(): **kwargs) # load model to backend if args.device == "tpu": - model = backend.load(args.model, inputs=args.inputs, outputs=args.outputs, use_tpu=True) + model = backend.load( + args.model, + inputs=args.inputs, + outputs=args.outputs, + use_tpu=True) else: - model = 
backend.load(args.model, inputs=args.inputs, outputs=args.outputs) + model = backend.load( + args.model, + inputs=args.inputs, + outputs=args.outputs) final_results = { "runtime": model.name(), "version": model.version(), @@ -522,11 +594,6 @@ def main(): "cmdline": str(args), } - mlperf_conf = os.path.abspath(args.mlperf_conf) - if not os.path.exists(mlperf_conf): - log.error("{} not found".format(mlperf_conf)) - sys.exit(1) - user_conf = os.path.abspath(args.user_conf) if not os.path.exists(user_conf): log.error("{} not found".format(user_conf)) @@ -558,7 +625,12 @@ def main(): lg.TestScenario.Server: QueueRunner, lg.TestScenario.Offline: QueueRunner } - runner = runner_map[scenario](model, ds, args.threads, post_proc=post_proc, max_batchsize=args.max_batchsize) + runner = runner_map[scenario]( + model, + ds, + args.threads, + post_proc=post_proc, + max_batchsize=args.max_batchsize) def issue_queries(query_samples): runner.enqueue(query_samples) @@ -574,7 +646,8 @@ def flush_queries(): log_settings.log_output = log_output_settings settings = lg.TestSettings() - settings.FromConfig(mlperf_conf, args.model_name, args.scenario) + # mlperf.conf is automatically loaded by the loadgen + # settings.FromConfig(mlperf_conf, args.model_name, args.scenario) settings.FromConfig(user_conf, args.model_name, args.scenario) settings.scenario = scenario settings.mode = lg.TestMode.PerformanceOnly @@ -601,11 +674,17 @@ def flush_queries(): settings.multi_stream_samples_per_query = args.samples_per_query if args.max_latency: settings.server_target_latency_ns = int(args.max_latency * NANO_SEC) - settings.multi_stream_expected_latency_ns = int(args.max_latency * NANO_SEC) + settings.multi_stream_expected_latency_ns = int( + args.max_latency * NANO_SEC) - performance_sample_count = args.performance_sample_count if args.performance_sample_count else min(count, 500) + performance_sample_count = args.performance_sample_count if args.performance_sample_count else min( + count, 500) sut = lg.ConstructSUT(issue_queries, flush_queries) - qsl = lg.ConstructQSL(count, performance_sample_count, ds.load_query_samples, ds.unload_query_samples) + qsl = lg.ConstructQSL( + count, + performance_sample_count, + ds.load_query_samples, + ds.unload_query_samples) log.info("starting {}".format(scenario)) result_dict = {"good": 0, "total": 0, "scenario": str(scenario)} diff --git a/vision/classification_and_detection/run_local.sh b/vision/classification_and_detection/run_local.sh index 120b538c7..680160d33 100755 --- a/vision/classification_and_detection/run_local.sh +++ b/vision/classification_and_detection/run_local.sh @@ -2,7 +2,7 @@ source ./run_common.sh -common_opt="--mlperf_conf ../../mlperf.conf" +common_opt="" dataset="--dataset-path $DATA_DIR" OUTPUT_DIR=${OUTPUT_DIR:-`pwd`/output/$name} if [ ! 
-d "$OUTPUT_DIR" ]; then From f74d16f54131d9080b9e45f234cc23e0ebaaf20c Mon Sep 17 00:00:00 2001 From: Zhihan Jiang <68881590+nvzhihanj@users.noreply.github.com> Date: Tue, 22 Oct 2024 13:38:02 -0700 Subject: [PATCH 2/2] Apply min_new_tokens=2 to mixtral-8x7b, address #1777 (#1884) --- compliance/nvidia/TEST06/README.md | 2 +- compliance/nvidia/TEST06/run_verification.py | 3 +-- language/mixtral-8x7b/README.md | 13 ++++++++----- language/mixtral-8x7b/SUT.py | 2 +- 4 files changed, 11 insertions(+), 9 deletions(-) diff --git a/compliance/nvidia/TEST06/README.md b/compliance/nvidia/TEST06/README.md index 9ca041cbd..0fd29dd39 100644 --- a/compliance/nvidia/TEST06/README.md +++ b/compliance/nvidia/TEST06/README.md @@ -10,7 +10,7 @@ This repository provides the config files and scripts to run and verify TEST 06 The purpose of this test is to ensure the consistency of the output of the LLM (Llama2 and Mixtral) model and avoid a potential EOS exploit. This test will make a performance run, with a limit of 100 samples and logging them into `mlperf_log_accuracy.json`. To achieve a passing result in this test, three criteria must be met: - In the case the first token is reported independently (not applicable for Offline scenario), it should match for every query with the first token of the model output. -- For each query, the model output should only end with zero or one EOS token. The only exception for 2 EOS tokens is when the entire output sequences are EOS tokens (i.e. output is [eos_token_id, eos_token_id]) +- For each query, the model output should only end with zero or one EOS token. - The number of reported tokens should match with the length of output sequence. ## Requisites diff --git a/compliance/nvidia/TEST06/run_verification.py b/compliance/nvidia/TEST06/run_verification.py index 092672a9a..0f57d01f8 100644 --- a/compliance/nvidia/TEST06/run_verification.py +++ b/compliance/nvidia/TEST06/run_verification.py @@ -51,8 +51,7 @@ def eos_check(acc_data, dtype, eos_token_id=2): if data[i] == eos_token_id: n_eos_tokens += 1 if n_eos_tokens >= 2: - # Allow output to be [eos_token_id, eos_token_id] - return len(data) == 2 + return False if data[i] != eos_token_id: break i-=1 diff --git a/language/mixtral-8x7b/README.md b/language/mixtral-8x7b/README.md index 341ae539a..7b20b6ba0 100644 --- a/language/mixtral-8x7b/README.md +++ b/language/mixtral-8x7b/README.md @@ -109,6 +109,9 @@ rclone copyurl https://inference.mlcommons-storage.org/mixtral_8x7b%2F2024.06.06 #### Using wget Alternatively, you can simply cd into the folder where you want to place the dataset and run + +TBD: The dataset is being replaced in v5.0 due to https://github.com/mlcommons/inference/issues/1777 + ```bash wget https://inference.mlcommons-storage.org/mixtral_8x7b%2F2024.06.06_mixtral_15k_v4.pkl ``` @@ -261,17 +264,17 @@ python -u evaluate-accuracy.py --checkpoint-path [path_to_model_checkpoint] \ Reference scores: Open Orca: ```json -{'rouge1': 45.4911, 'rouge2': 23.2829, 'rougeL': 30.3615} +{'rouge1': 45.5989, 'rouge2': 23.3526, 'rougeL': 30.4608} ``` GSM8K: ```json -{'gsm8k': 73.78} +{'gsm8k': 73.66} ``` MBXP: ```json -{'mbxp': 60.12} +{'mbxp': 60.16} ``` -For official submissions, 99% of each reference score is enforced. Additionally, 90%-110% of the generated tokens_per_samples: +For official submissions, 99% of each reference score is enforced. 
Additionally, the generated tokens_per_sample (counting all non-EOS tokens) must be within 90%-110% of the reference: ```json -{'tokens_per_sample': 145.9} +{'tokens_per_sample': 144.84} ``` diff --git a/language/mixtral-8x7b/SUT.py b/language/mixtral-8x7b/SUT.py index 536ba46f5..5c487a630 100644 --- a/language/mixtral-8x7b/SUT.py +++ b/language/mixtral-8x7b/SUT.py @@ -27,7 +27,7 @@ gen_kwargs = { "early_stopping": True, "max_new_tokens": 1024, - "min_new_tokens": 1, + "min_new_tokens": 2, "num_beams": 1, "do_sample": False }
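Note on the new calling convention introduced by patch 1/2: since mlperf.conf is now bundled with the loadgen and loaded automatically on the first `FromConfig` call, benchmark code passes only the user.conf (and, for compliance runs, the audit config) to `TestSettings.FromConfig`; supplying a second non-mlperf conf file is logged as `error_invalid_config` and is not valid for official submission. Below is a minimal sketch of this convention, assuming the loadgen wheel built from this revision is installed and a `user.conf` file exists in the working directory; the model name "resnet50" and the "Offline" scenario are illustrative placeholders.

```python
# Minimal sketch of the post-patch calling convention (assumptions: the
# mlperf_loadgen wheel from this revision is installed, a user.conf file
# exists in the working directory, and "resnet50"/"Offline" are just
# illustrative placeholders).
import mlperf_loadgen as lg

settings = lg.TestSettings()
# mlperf.conf is bundled with the loadgen and loaded automatically on the
# first FromConfig call, so only user.conf is passed in explicitly.
settings.FromConfig("user.conf", "resnet50", "Offline")
settings.scenario = lg.TestScenario.Offline
settings.mode = lg.TestMode.PerformanceOnly
```

This mirrors the reference-implementation changes above, where the `--mlperf_conf` argument and the first `settings.FromConfig(mlperf_conf, ...)` call are removed and only the user.conf call remains.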