From c92f72ebebf5f4a1e63b726e6e5cec1a47250bb5 Mon Sep 17 00:00:00 2001 From: Jian Chen Date: Tue, 16 Jan 2024 11:59:03 -0500 Subject: [PATCH 01/15] Merge Linux Nuget GPU pipeline with zip-nuget (#19120) ### Description ### Motivation and Context --- .../c-api-noopenmp-packaging-pipelines.yml | 174 ++---------------- .../nuget-linux-cuda-packaging-stage.yml | 18 +- 2 files changed, 31 insertions(+), 161 deletions(-) diff --git a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml index f80b035582f18..2169a3ce1bb9e 100644 --- a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml +++ b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml @@ -83,6 +83,16 @@ resources: variables: - name: ReleaseVersionSuffix value: '' +- name: docker_base_image + ${{ if eq(parameters.CudaVersion, '11.8') }}: + value: nvidia/cuda:11.8.0-cudnn8-devel-ubi8 + ${{ if eq(parameters.CudaVersion, '12.2') }}: + value: nvidia/cuda:12.2.2-cudnn8-devel-ubi8 +- name: linux_trt_version + ${{ if eq(parameters.CudaVersion, '11.8') }}: + value: 8.6.1.6-1.cuda11.8 + ${{ if eq(parameters.CudaVersion, '12.2') }}: + value: 8.6.1.6-1.cuda12.0 stages: - stage: Setup @@ -189,64 +199,11 @@ stages: AdditionalWinBuildFlags: '--enable_onnx_tests --enable_wcos' BuildVariant: 'default' -- stage: Linux_C_API_Packaging_GPU_x64 - dependsOn: [] - jobs: - - job: - workspace: - clean: all - timeoutInMinutes: 120 - pool: 'Onnxruntime-Linux-GPU' - variables: - - name: CUDA_VERSION_MAJOR - ${{ if eq(parameters.CudaVersion, '11.8') }}: - value: '11' - ${{ if eq(parameters.CudaVersion, '12.2') }}: - value: '12' - - name: CUDA_VERSION - value: ${{ parameters.CudaVersion }} - steps: - - template: templates/set-version-number-variables-step.yml - - template: templates/get-docker-image-steps.yml - parameters: - Dockerfile: tools/ci_build/github/linux/docker/inference/x64/default/gpu/Dockerfile - Context: tools/ci_build/github/linux/docker/inference/x64/default/gpu - DockerBuildArgs: "--build-arg BUILD_UID=$( id -u )" - Repository: onnxruntimecuda$(CUDA_VERSION_MAJOR)build - - - script: $(Build.SourcesDirectory)/tools/ci_build/github/linux/build_cuda_c_api_package.sh - workingDirectory: $(Build.SourcesDirectory) - displayName: 'Build and Test' - - - template: templates/java-api-artifacts-package-and-publish-steps-posix.yml - parameters: - arch: 'linux-x64' - buildConfig: 'Release' - artifactName: 'onnxruntime-java-linux-x64-cuda' - version: '$(OnnxRuntimeVersion)' - libraryName: 'libonnxruntime.so' - nativeLibraryName: 'libonnxruntime4j_jni.so' - - - template: templates/c-api-artifacts-package-and-publish-steps-posix.yml - parameters: - buildConfig: 'Release' - artifactName: 'onnxruntime-linux-x64-cuda-$(OnnxRuntimeVersion)' - artifactNameNoVersionString: 'onnxruntime-linux-x64-cuda' - libraryName: 'libonnxruntime.so.$(OnnxRuntimeVersion)' - - - template: templates/component-governance-component-detection-steps.yml - parameters: - condition: 'succeeded' - - template: templates/clean-agent-build-directory-step.yml - -- template: templates/linux-gpu-tensorrt-packaging-pipeline.yml +- template: stages/nuget-linux-cuda-packaging-stage.yml parameters: - artifactName: 'onnxruntime-linux-x64-tensorrt-$(OnnxRuntimeVersion)' - artifactNameNoVersionString: 'onnxruntime-linux-x64-tensorrt' - buildJava: true - buildJavaOption: '--build_java' - buildNodejs: true - buildNodejsOption: '--build_nodejs' + CudaVersion: ${{ 
parameters.CudaVersion }} + docker_base_image: ${{ variables.docker_base_image }} + linux_trt_version: ${{ variables.linux_trt_version }} #CUDA without tensorrt - template: templates/win-ci.yml @@ -527,109 +484,6 @@ stages: displayName: 'Clean Agent Directories' condition: always() -- stage: Linux_Packaging_combined_GPU - dependsOn: - - Linux_C_API_Packaging_GPU_x64 - - Linux_C_API_Packaging_GPU_TensorRT_x64 - condition: succeeded() - jobs: - - job: - workspace: - clean: all - pool: 'Onnxruntime-Linux-GPU' - - steps: - - checkout: self # due to checkout multiple repos, the root directory is $(Build.SourcesDirectory)/onnxruntime - submodules: false - - checkout: onnxruntime-inference-examples # due to checkout multiple repos, the root directory is $(Build.SourcesDirectory)/onnxruntime-inference-examples - submodules: false - - checkout: manylinux # due to checkout multiple repos, the root directory is $(Build.SourcesDirectory)/manylinux - submodules: false - - task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3 - displayName: 'Clean Agent Directories' - condition: always() - - - script: | - set -e -x - cd $(Build.SourcesDirectory) - mv manylinux onnxruntime - ls - - - template: templates/with-container-registry-steps.yml - parameters: - Steps: - - script: | - tools/ci_build/get_docker_image.py \ - --dockerfile tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda \ - --context tools/ci_build/github/linux/docker \ - --docker-build-args "--network=host --build-arg BASEIMAGE=nvidia/cuda:11.8.0-cudnn8-devel-ubi8 --build-arg TRT_VERSION=8.6.1.6-1.cuda11.8 --build-arg BUILD_UID=$( id -u )" \ - --container-registry onnxruntimebuildcache \ - --multiple_repos \ - --repository onnxruntimecuda118xtrt86build - displayName: "Get onnxruntimecuda118xtrt86build image for tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda" - workingDirectory: $(Build.SourcesDirectory)/onnxruntime - ContainerRegistry: onnxruntimebuildcache - - - template: templates/set-version-number-variables-step.yml - parameters: - versionFileDirectory: '$(Build.SourcesDirectory)/onnxruntime' - workingDirectory: '$(Build.SourcesDirectory)/onnxruntime' - - task: DownloadPipelineArtifact@2 - displayName: 'Download Pipeline Artifact - Combined GPU' - inputs: - artifactName: 'onnxruntime-linux-x64-cuda' - targetPath: '$(Build.BinariesDirectory)/tgz-artifacts' - - - task: DownloadPipelineArtifact@2 - displayName: 'Download Pipeline Artifact - Combined GPU' - inputs: - artifactName: 'onnxruntime-linux-x64-tensorrt' - targetPath: '$(Build.BinariesDirectory)/tgz-artifacts' - - - task: ShellScript@2 - displayName: 'Shell Script' - inputs: - scriptPath: 'onnxruntime/tools/ci_build/github/linux/extract_and_bundle_gpu_package.sh' - args: '-a $(Build.BinariesDirectory)/tgz-artifacts' - workingDirectory: '$(Build.BinariesDirectory)/tgz-artifacts' - - - task: ArchiveFiles@2 - inputs: - rootFolderOrFile: '$(Build.BinariesDirectory)/tgz-artifacts/onnxruntime-linux-x64-gpu' - includeRootFolder: false - archiveType: 'tar' # Options: zip, 7z, tar, wim - tarCompression: 'gz' - archiveFile: '$(Build.ArtifactStagingDirectory)/onnxruntime-linux-x64-gpu-$(OnnxRuntimeVersion).tgz' - replaceExistingArchive: true - - - template: templates/validate-package.yml - parameters: - PackageType: 'tarball' - PackagePath: '$(Build.ArtifactStagingDirectory)' - PackageName: 'onnxruntime-linux-x64-gpu-$(OnnxRuntimeVersion).tgz' - ScriptPath: '$(Build.SourcesDirectory)/onnxruntime/tools/nuget/validate_package.py' - 
PlatformsSupported: 'linux-x64'
-      VerifyNugetSigning: false
-      workingDirectory: '$(Build.ArtifactStagingDirectory)'
-
-
-  - task: CmdLine@2
-    displayName: 'Test C API application for GPU package'
-    inputs:
-      script: |
-        docker run --gpus all -e CFLAGS="-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -O3 -Wl,--strip-all" -e CXXFLAGS="-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -O3 -Wl,--strip-all" -e NVIDIA_VISIBLE_DEVICES=all --rm --volume /data/models:/data/models --volume $(Build.SourcesDirectory):/src_dir \
-        --volume $(Build.ArtifactStagingDirectory):/artifact_src -e NIGHTLY_BUILD onnxruntimecuda118xtrt86build \
-        /src_dir/onnxruntime-inference-examples/c_cxx/squeezenet/run_capi_application.sh -o /src_dir/onnxruntime -p /artifact_src/onnxruntime-linux-x64-gpu-$(OnnxRuntimeVersion).tgz -w /src_dir/onnxruntime-inference-examples/c_cxx/squeezenet
-      workingDirectory: '$(Build.ArtifactStagingDirectory)'
-
-  - task: PublishPipelineArtifact@1
-    inputs:
-      targetPath: '$(Build.ArtifactStagingDirectory)/onnxruntime-linux-x64-gpu-$(OnnxRuntimeVersion).tgz'
-      artifactName: 'onnxruntime-linux-x64-gpu'
-  - template: templates/component-governance-component-detection-steps.yml
-    parameters :
-      condition : 'succeeded'
-
- stage: Windows_Packaging_combined_GPU
   dependsOn:

diff --git a/tools/ci_build/github/azure-pipelines/stages/nuget-linux-cuda-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/nuget-linux-cuda-packaging-stage.yml
index 48a6e0e8529e6..dbbc9ef27e513 100644
--- a/tools/ci_build/github/azure-pipelines/stages/nuget-linux-cuda-packaging-stage.yml
+++ b/tools/ci_build/github/azure-pipelines/stages/nuget-linux-cuda-packaging-stage.yml
@@ -40,7 +40,16 @@ stages:
   - script: $(Build.SourcesDirectory)/tools/ci_build/github/linux/build_cuda_c_api_package.sh
     workingDirectory: $(Build.SourcesDirectory)
     displayName: 'Build and Test'
-
+# We only support Maven package for CUDA 11.8
+  - ${{ if eq(parameters.CudaVersion, '11.8') }}:
+    - template: ../templates/java-api-artifacts-package-and-publish-steps-posix.yml
+      parameters:
+        arch: 'linux-x64'
+        buildConfig: 'Release'
+        artifactName: 'onnxruntime-java-linux-x64-cuda'
+        version: '$(OnnxRuntimeVersion)'
+        libraryName: 'libonnxruntime.so'
+        nativeLibraryName: 'libonnxruntime4j_jni.so'
   - template: ../templates/c-api-artifacts-package-and-publish-steps-posix.yml
     parameters:
       buildConfig: 'Release'
@@ -82,6 +91,10 @@ stages:
     - checkout: manylinux # due to checkout multiple repos, the root directory is $(Build.SourcesDirectory)/manylinux
       submodules: false
 
+    - task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3
+      displayName: 'Clean Agent Directories'
+      condition: always()
+
     - script: |
         set -e -x
         cd $(Build.SourcesDirectory)
@@ -159,3 +172,6 @@ stages:
       inputs:
         targetPath: '$(Build.ArtifactStagingDirectory)/onnxruntime-linux-x64-gpu-$(OnnxRuntimeVersion).tgz'
        artifactName: 'onnxruntime-linux-x64-gpu'
+  - template: ../templates/component-governance-component-detection-steps.yml
+    parameters:
+      condition: 'succeeded'
\ No newline at end of file

From e2e488d6f8bcd14f40e9e2c8e65f310ce9c0e872 Mon Sep 17 00:00:00 2001
From: Changming Sun
Date: Tue, 16 Jan 2024 09:18:35 -0800
Subject: [PATCH 02/15] Revert "iOS packaging pipeline stability" (#19135)

Reverts microsoft/onnxruntime#19097 because it broke the Android CI
pipeline.
--- .../external/onnxruntime_external_deps.cmake | 74 +++++++++---------- .../mac-ios-packaging-pipeline.yml | 2 +- .../stages/mac-ios-packaging-build-stage.yml | 7 +- 3 files changed, 42 insertions(+), 41 deletions(-) diff --git a/cmake/external/onnxruntime_external_deps.cmake b/cmake/external/onnxruntime_external_deps.cmake index c79bb87fd7f5d..78f63227c8392 100644 --- a/cmake/external/onnxruntime_external_deps.cmake +++ b/cmake/external/onnxruntime_external_deps.cmake @@ -108,14 +108,41 @@ FetchContent_Declare( ) # Download a protoc binary from Internet if needed -if(NOT ONNX_CUSTOM_PROTOC_EXECUTABLE) +if(CMAKE_CROSSCOMPILING AND NOT ONNX_CUSTOM_PROTOC_EXECUTABLE) # This part of code is only for users' convenience. The code couldn't handle all cases. Users always can manually # download protoc from Protobuf's Github release page and pass the local path to the ONNX_CUSTOM_PROTOC_EXECUTABLE # variable. - if (APPLE) - # Using CMAKE_CROSSCOMPILING is not recommended for Apple target devices. - # https://cmake.org/cmake/help/v3.26/variable/CMAKE_CROSSCOMPILING.html - # To keep it simple, just download and use the universal protoc binary for Apple builds. + message("CMAKE_HOST_SYSTEM_NAME: ${CMAKE_HOST_SYSTEM_NAME}") + if(CMAKE_HOST_SYSTEM_NAME STREQUAL "Windows") + if(CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "AMD64") + FetchContent_Declare(protoc_binary URL ${DEP_URL_protoc_win64} URL_HASH SHA1=${DEP_SHA1_protoc_win64}) + FetchContent_Populate(protoc_binary) + elseif(CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "x86") + FetchContent_Declare(protoc_binary URL ${DEP_URL_protoc_win32} URL_HASH SHA1=${DEP_SHA1_protoc_win32}) + FetchContent_Populate(protoc_binary) + endif() + if(protoc_binary_SOURCE_DIR) + message("Use prebuilt protoc") + set(ONNX_CUSTOM_PROTOC_EXECUTABLE ${protoc_binary_SOURCE_DIR}/bin/protoc.exe) + set(PROTOC_EXECUTABLE ${ONNX_CUSTOM_PROTOC_EXECUTABLE}) + endif() + elseif(CMAKE_HOST_SYSTEM_NAME STREQUAL "Linux") + if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "^(x86_64|amd64)$") + FetchContent_Declare(protoc_binary URL ${DEP_URL_protoc_linux_x64} URL_HASH SHA1=${DEP_SHA1_protoc_linux_x64}) + FetchContent_Populate(protoc_binary) + elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(i.86|x86?)$") + FetchContent_Declare(protoc_binary URL ${DEP_URL_protoc_linux_x86} URL_HASH SHA1=${DEP_SHA1_protoc_linux_x86}) + FetchContent_Populate(protoc_binary) + elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^aarch64.*") + FetchContent_Declare(protoc_binary URL ${DEP_URL_protoc_linux_aarch64} URL_HASH SHA1=${DEP_SHA1_protoc_linux_aarch64}) + FetchContent_Populate(protoc_binary) + endif() + if(protoc_binary_SOURCE_DIR) + message("Use prebuilt protoc") + set(ONNX_CUSTOM_PROTOC_EXECUTABLE ${protoc_binary_SOURCE_DIR}/bin/protoc) + set(PROTOC_EXECUTABLE ${ONNX_CUSTOM_PROTOC_EXECUTABLE}) + endif() + elseif ((CMAKE_SYSTEM_NAME STREQUAL "Emscripten" OR CMAKE_SYSTEM_NAME STREQUAL "Android" OR CMAKE_SYSTEM_NAME STREQUAL "iOS") AND CMAKE_HOST_SYSTEM_NAME STREQUAL "Darwin") FetchContent_Declare(protoc_binary URL ${DEP_URL_protoc_mac_universal} URL_HASH SHA1=${DEP_SHA1_protoc_mac_universal}) FetchContent_Populate(protoc_binary) if(protoc_binary_SOURCE_DIR) @@ -123,38 +150,6 @@ if(NOT ONNX_CUSTOM_PROTOC_EXECUTABLE) set(ONNX_CUSTOM_PROTOC_EXECUTABLE ${protoc_binary_SOURCE_DIR}/bin/protoc) set(PROTOC_EXECUTABLE ${ONNX_CUSTOM_PROTOC_EXECUTABLE}) endif() - elseif(CMAKE_CROSSCOMPILING) - message("CMAKE_HOST_SYSTEM_NAME: ${CMAKE_HOST_SYSTEM_NAME}") - if(CMAKE_HOST_SYSTEM_NAME STREQUAL "Windows") - if(CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "AMD64") - 
FetchContent_Declare(protoc_binary URL ${DEP_URL_protoc_win64} URL_HASH SHA1=${DEP_SHA1_protoc_win64}) - FetchContent_Populate(protoc_binary) - elseif(CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "x86") - FetchContent_Declare(protoc_binary URL ${DEP_URL_protoc_win32} URL_HASH SHA1=${DEP_SHA1_protoc_win32}) - FetchContent_Populate(protoc_binary) - endif() - if(protoc_binary_SOURCE_DIR) - message("Use prebuilt protoc") - set(ONNX_CUSTOM_PROTOC_EXECUTABLE ${protoc_binary_SOURCE_DIR}/bin/protoc.exe) - set(PROTOC_EXECUTABLE ${ONNX_CUSTOM_PROTOC_EXECUTABLE}) - endif() - elseif(CMAKE_HOST_SYSTEM_NAME STREQUAL "Linux") - if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "^(x86_64|amd64)$") - FetchContent_Declare(protoc_binary URL ${DEP_URL_protoc_linux_x64} URL_HASH SHA1=${DEP_SHA1_protoc_linux_x64}) - FetchContent_Populate(protoc_binary) - elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(i.86|x86?)$") - FetchContent_Declare(protoc_binary URL ${DEP_URL_protoc_linux_x86} URL_HASH SHA1=${DEP_SHA1_protoc_linux_x86}) - FetchContent_Populate(protoc_binary) - elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^aarch64.*") - FetchContent_Declare(protoc_binary URL ${DEP_URL_protoc_linux_aarch64} URL_HASH SHA1=${DEP_SHA1_protoc_linux_aarch64}) - FetchContent_Populate(protoc_binary) - endif() - if(protoc_binary_SOURCE_DIR) - message("Use prebuilt protoc") - set(ONNX_CUSTOM_PROTOC_EXECUTABLE ${protoc_binary_SOURCE_DIR}/bin/protoc) - set(PROTOC_EXECUTABLE ${ONNX_CUSTOM_PROTOC_EXECUTABLE}) - endif() - endif() endif() endif() @@ -189,9 +184,9 @@ FetchContent_Declare( ) set(protobuf_BUILD_TESTS OFF CACHE BOOL "Build protobuf tests" FORCE) -#TODO: we'd better to turn the following option off. However, it will cause +#TODO: we'd better to turn the following option off. However, it will cause # ".\build.bat --config Debug --parallel --skip_submodule_sync --update" fail with an error message: -# install(EXPORT "ONNXTargets" ...) includes target "onnx_proto" which requires target "libprotobuf-lite" that is +# install(EXPORT "ONNXTargets" ...) includes target "onnx_proto" which requires target "libprotobuf-lite" that is # not in any export set. #set(protobuf_INSTALL OFF CACHE BOOL "Install protobuf binaries and files" FORCE) set(protobuf_USE_EXTERNAL_GTEST ON CACHE BOOL "" FORCE) @@ -567,3 +562,4 @@ endif() FILE(TO_NATIVE_PATH ${CMAKE_BINARY_DIR} ORT_BINARY_DIR) FILE(TO_NATIVE_PATH ${PROJECT_SOURCE_DIR} ORT_SOURCE_DIR) + diff --git a/tools/ci_build/github/azure-pipelines/mac-ios-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/mac-ios-packaging-pipeline.yml index 34a51649fc384..5fd15b64e03b6 100644 --- a/tools/ci_build/github/azure-pipelines/mac-ios-packaging-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/mac-ios-packaging-pipeline.yml @@ -53,7 +53,7 @@ stages: displayName: "Set common variables" pool: - vmImage: "macOS-12" # macOS-13 seems less stable. macOS-12 will work for this job. 
+ vmImage: "macOS-13" timeoutInMinutes: 5 diff --git a/tools/ci_build/github/azure-pipelines/templates/stages/mac-ios-packaging-build-stage.yml b/tools/ci_build/github/azure-pipelines/templates/stages/mac-ios-packaging-build-stage.yml index ed32c5d0e15be..d1dff0769e25f 100644 --- a/tools/ci_build/github/azure-pipelines/templates/stages/mac-ios-packaging-build-stage.yml +++ b/tools/ci_build/github/azure-pipelines/templates/stages/mac-ios-packaging-build-stage.yml @@ -78,6 +78,10 @@ stages: pip install -r tools/ci_build/github/apple/ios_packaging.requirements.txt displayName: "Install Python requirements" + - script: | + $(Build.SourcesDirectory)/tools/ci_build/github/linux/docker/inference/x64/python/cpu/scripts/install_protobuf.sh -p $(Build.BinariesDirectory)/protobuf_install -d $(Build.SourcesDirectory)/cmake/deps.txt + displayName: "Build Host Protoc" + # create and test mobile pods - script: | python tools/ci_build/github/apple/build_and_assemble_apple_pods.py \ @@ -87,7 +91,8 @@ stages: --test \ --variant ${{ parameters.packageVariant }} \ --build-settings-file "${{ variables.buildSettingsFile }}" \ - ${{ variables.optionalIncludeOpsByConfigOption }} + ${{ variables.optionalIncludeOpsByConfigOption }} \ + -b="--path_to_protoc_exe=$(Build.BinariesDirectory)/protobuf_install/bin/protoc" displayName: "Build macOS/iOS framework and assemble pod package files" - script: | From 80f274ca6f2f4572d827edd6dc7f736d7a8c036a Mon Sep 17 00:00:00 2001 From: Patrice Vignola Date: Tue, 16 Jan 2024 09:42:59 -0800 Subject: [PATCH 03/15] Fix SkipLayerNormalization shape inference (#18724) SkipLayerNorm has more than one input, so `propagateShapeAndTypeFromFirstInput` is not enough. --- .../core/graph/contrib_ops/bert_defs.cc | 4 +- .../contrib_ops/shape_inference_functions.cc | 39 +++++++++++++++++++ .../contrib_ops/shape_inference_functions.h | 3 +- 3 files changed, 43 insertions(+), 3 deletions(-) diff --git a/onnxruntime/core/graph/contrib_ops/bert_defs.cc b/onnxruntime/core/graph/contrib_ops/bert_defs.cc index df8d0a59cb033..0317ffcfb0e31 100644 --- a/onnxruntime/core/graph/contrib_ops/bert_defs.cc +++ b/onnxruntime/core/graph/contrib_ops/bert_defs.cc @@ -1285,7 +1285,7 @@ ONNX_MS_OPERATOR_SET_SCHEMA( .Output(3, "input_skip_bias_sum", "Sum of the input and skip inputs (and bias if it exists) with shape (batch_size, sequence_length, hidden_size).", "T", OpSchema::Optional) .TypeConstraint("T", {"tensor(float)", "tensor(float16)"}, "Constrain input and output types to float or half tensors.") .TypeConstraint("U", {"tensor(float)"}, "Constrain mean and inv_std_var to float tensors.") - .TypeAndShapeInferenceFunction(ONNX_NAMESPACE::propagateShapeAndTypeFromFirstInput)); + .TypeAndShapeInferenceFunction(SkipLayerNormalizationShapeInference)); ONNX_MS_OPERATOR_SET_SCHEMA( SkipSimplifiedLayerNormalization, 1, @@ -1334,7 +1334,7 @@ ONNX_MS_OPERATOR_SET_SCHEMA( OpSchema::Optional) .TypeConstraint("T", {"tensor(float)", "tensor(float16)"}, "Constrain input and output types to float or half tensors.") .TypeConstraint("U", {"tensor(float)"}, "Constrain mean and inv_std_var to float tensors.") - .TypeAndShapeInferenceFunction(ONNX_NAMESPACE::propagateShapeAndTypeFromFirstInput)); + .TypeAndShapeInferenceFunction(SkipLayerNormalizationShapeInference)); constexpr const char* NGramRepeatBlock_ver1_doc = R"DOC( Enforce no repetition of n-grams. Scores are set to `-inf` for tokens that form a repeated n-gram if added to the back of the input_ids. 
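For intuition, here is a minimal NumPy sketch (illustrative only, not part of the patch; simplified math and made-up shapes) of the SkipLayerNormalization semantics that the corrected inference function below mirrors. The key point is the output shapes: `mean` and `inv_std_var` keep the input shape with the last axis reduced to 1, which propagating only the first input's type and shape could not express.

import numpy as np

def skip_layer_norm_reference(x, skip, gamma, bias=None, eps=1e-12):
    # Sum the main input with the skip connection (plus bias when present).
    y = x + skip if bias is None else x + skip + bias
    input_skip_bias_sum = y
    # Statistics are reduced over the last (hidden) axis, kept as size 1.
    mean = y.mean(axis=-1, keepdims=True)
    inv_std_var = 1.0 / np.sqrt(y.var(axis=-1, keepdims=True) + eps)
    output = (y - mean) * inv_std_var * gamma
    return output, mean, inv_std_var, input_skip_bias_sum

x = np.random.rand(2, 4, 8).astype(np.float32)  # (batch, sequence, hidden)
out, mean, inv_std, s = skip_layer_norm_reference(x, x, np.ones(8, np.float32))
print(out.shape, mean.shape, inv_std.shape, s.shape)  # (2, 4, 8) (2, 4, 1) (2, 4, 1) (2, 4, 8)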
diff --git a/onnxruntime/core/graph/contrib_ops/shape_inference_functions.cc b/onnxruntime/core/graph/contrib_ops/shape_inference_functions.cc
index eeef20e9dff5e..8b1812f62be25 100644
--- a/onnxruntime/core/graph/contrib_ops/shape_inference_functions.cc
+++ b/onnxruntime/core/graph/contrib_ops/shape_inference_functions.cc
@@ -114,6 +114,45 @@ void EmbedLayerNormalizationShapeInference(::ONNX_NAMESPACE::InferenceContext& c
   }
 }
 
+void SkipLayerNormalizationShapeInference(::ONNX_NAMESPACE::InferenceContext& ctx) {
+  propagateShapeAndTypeFromFirstInput(ctx);
+
+  auto stash_type = ONNX_NAMESPACE::TensorProto_DataType_FLOAT;
+  if (ctx.getNumOutputs() > 1) {
+    auto output_type = ctx.getOutputType(1);
+    output_type->mutable_tensor_type()->set_elem_type(static_cast<int32_t>(stash_type));
+  }
+  if (ctx.getNumOutputs() > 2) {
+    auto output_type = ctx.getOutputType(2);
+    output_type->mutable_tensor_type()->set_elem_type(static_cast<int32_t>(stash_type));
+  }
+  if (ctx.getNumOutputs() > 3) {
+    propagateElemTypeFromInputToOutput(ctx, 0, 3);
+  }
+  if (!hasNInputShapes(ctx, 1)) {
+    return;
+  }
+  auto& input_shape = ctx.getInputType(0)->tensor_type().shape();
+  int64_t input_ndim = input_shape.dim_size();
+  int axis = static_cast<int>(input_ndim - 1);
+
+  if (ctx.getNumOutputs() > 1) {
+    auto mean_shape = ctx.getOutputType(1)->mutable_tensor_type()->mutable_shape();
+    mean_shape->CopyFrom(input_shape);
+    mean_shape->mutable_dim(axis)->set_dim_value(1);
+  }
+
+  if (ctx.getNumOutputs() > 2) {
+    auto inv_std_dev_shape = ctx.getOutputType(2)->mutable_tensor_type()->mutable_shape();
+    inv_std_dev_shape->CopyFrom(input_shape);
+    inv_std_dev_shape->mutable_dim(axis)->set_dim_value(1);
+  }
+
+  if (ctx.getNumOutputs() > 3) {
+    propagateShapeFromInputToOutput(ctx, 0, 3);
+  }
+}
+
 // Shape inference for Attention and QAttention
 void AttentionTypeAndShapeInference(ONNX_NAMESPACE::InferenceContext& ctx, int past_input_index) {
   // Input 0, 1, 2 are input, weights and bias.
diff --git a/onnxruntime/core/graph/contrib_ops/shape_inference_functions.h b/onnxruntime/core/graph/contrib_ops/shape_inference_functions.h
index 93cf5b304f653..6eb06af15309c 100644
--- a/onnxruntime/core/graph/contrib_ops/shape_inference_functions.h
+++ b/onnxruntime/core/graph/contrib_ops/shape_inference_functions.h
@@ -13,5 +13,6 @@ namespace onnxruntime {
 namespace contrib {
 void AttentionTypeAndShapeInference(ONNX_NAMESPACE::InferenceContext& ctx, int past_input_index);
 void EmbedLayerNormalizationShapeInference(::ONNX_NAMESPACE::InferenceContext& ctx);
+void SkipLayerNormalizationShapeInference(::ONNX_NAMESPACE::InferenceContext& ctx);
 }  // namespace contrib
-}  // namespace onnxruntime
\ No newline at end of file
+}  // namespace onnxruntime

From 8e272b9cac70a11c472fb002af755213a4dabf66 Mon Sep 17 00:00:00 2001
From: Jian Chen
Date: Tue, 16 Jan 2024 16:53:15 -0500
Subject: [PATCH 04/15] Update build.py to remove unused functions and update
 python to 3.8 (#19164)

### Description
Remove the unused helpers `is_ubuntu_1604()`, `is_docker()`, and
`is_cross_compiling_on_apple()` from build.py, and raise the minimum
required Python version from 3.7 to 3.8.

### Motivation and Context

---
 tools/ci_build/build.py | 32 +-------------------------------
 1 file changed, 1 insertion(+), 31 deletions(-)

diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py
index 0da4adb51767d..1a6262edf45c9 100644
--- a/tools/ci_build/build.py
+++ b/tools/ci_build/build.py
@@ -56,7 +56,7 @@ def __init__(self, message):
 
 
 def _check_python_version():
-    required_minor_version = 7
+    required_minor_version = 8
     if (sys.version_info.major, sys.version_info.minor) < (3, required_minor_version):
         raise UsageError(
             f"Invalid Python version. At least Python 3.{required_minor_version} is required. "
@@ -786,11 +786,6 @@ def get_linux_distro():
         return "", ""
 
 
-def is_ubuntu_1604():
-    dist, ver = get_linux_distro()
-    return dist == "Ubuntu" and ver.startswith("16.04")
-
-
 def get_config_build_dir(build_dir, config):
     # build directory per configuration
     return os.path.join(build_dir, config)
@@ -844,15 +839,6 @@ def update_submodules(source_dir):
     run_subprocess(["git", "submodule", "update", "--init", "--recursive"], cwd=source_dir)
 
 
-def is_docker():
-    path = "/proc/self/cgroup"
-    return (
-        os.path.exists("/.dockerenv")
-        or os.path.isfile(path)
-        and any("docker" in line for line in open(path))  # noqa: SIM115
-    )
-
-
 def install_python_deps(numpy_version=""):
     dep_packages = ["setuptools", "wheel", "pytest"]
     dep_packages.append(f"numpy=={numpy_version}" if numpy_version else "numpy>=1.16.6")
@@ -2401,16 +2387,6 @@ def run_csharp_tests(source_dir, build_dir, use_cuda, use_openvino, use_tensorrt
         run_subprocess(cmd_args, cwd=csharp_source_dir)
 
 
-def is_cross_compiling_on_apple(args):
-    if not is_macOS():
-        return False
-    if args.ios:
-        return True
-    if args.osx_arch != platform.machine():
-        return True
-    return False
-
-
 def generate_documentation(source_dir, build_dir, configs, validate):
     # Randomly choose one build config
     config = next(iter(configs))
@@ -2725,12 +2701,6 @@ def main():
             log.info("Activating emsdk...")
             run_subprocess([emsdk_file, "activate", emsdk_version], cwd=emsdk_dir)
 
-        if is_ubuntu_1604():
-            if args.arm or args.arm64:
-                raise BuildError("Only Windows ARM(64) cross-compiled builds supported currently through this script")
-            if not is_docker() and not args.use_acl and not args.use_armnn:
-                install_python_deps()
-
         if args.enable_pybind and is_windows():
             install_python_deps(args.numpy_version)
 

From c935c8fbd2e463a3e0153145140a8efd780dfabc Mon Sep 17 00:00:00 2001
From: moyo1997 <54333118+moyo1997@users.noreply.github.com>
Date: Tue, 16 Jan 2024 16:24:37 -0800
Subject: [PATCH 05/15] remove unnecessary environment variable (#19166)

remove unnecessary environment variable when building as arm64x
---
 build_arm64x.bat | 1 -
 1 file changed, 1 deletion(-)

diff --git a/build_arm64x.bat b/build_arm64x.bat
index fbcdd373086a9..1ed268ae94a43 100644
--- a/build_arm64x.bat
+++ b/build_arm64x.bat
@@ -5,7 +5,6 @@
 setlocal
 
 set PATH=C:\Program Files\Git\usr\bin;%PATH%
-set LINK_REPRO_NAME=/mylink.rsp
 
 rem Requires a Python install to be available in your PATH
 python "%~dp0\tools\ci_build\build.py" --arm64 --buildasx --build_dir "%~dp0\build\arm64-x" %*

From e61861b0a121bca1d60e5d4a3722e52b6820c430 Mon Sep 17 00:00:00 2001
From: Hector Li
Date: Tue, 16 Jan 2024 16:36:28 -0800
Subject: [PATCH 06/15] Clean up generated files in QNN UTs (#19127)

### Description
Clean up generated files in QNN UTs
---
 onnxruntime/test/providers/qnn/simple_op_htp_test.cc | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/onnxruntime/test/providers/qnn/simple_op_htp_test.cc b/onnxruntime/test/providers/qnn/simple_op_htp_test.cc
index 8ff65c08e8633..c4244fe532456 100644
--- a/onnxruntime/test/providers/qnn/simple_op_htp_test.cc
+++ b/onnxruntime/test/providers/qnn/simple_op_htp_test.cc
@@ -815,7 +815,8 @@ TEST_F(QnnHTPBackendTests, ContextBinaryCacheNonEmbedModeTest) {
   // Check the Onnx skeleton file is generated
   EXPECT_TRUE(std::filesystem::exists(context_binary_file.c_str()));
   // Check the Qnn context cache binary file is generated
-  EXPECT_TRUE(std::filesystem::exists("qnn_context_cache_non_embed.onnx_QNNExecutionProvider_QNN_8283143575221199085_1_0.bin"));
+  std::string qnn_ctx_bin = "qnn_context_cache_non_embed.onnx_QNNExecutionProvider_QNN_8283143575221199085_1_0.bin";
+  EXPECT_TRUE(std::filesystem::exists(qnn_ctx_bin));
 
   // 2nd run loads and run from QDQ model + Onnx skeleton file + Qnn context cache binary file
   TestQDQModelAccuracy(BuildOpTestCase<float>(op_type, {input_def}, {}, {}),
@@ -837,6 +838,10 @@ TEST_F(QnnHTPBackendTests, ContextBinaryCacheNonEmbedModeTest) {
                        QDQTolerance(),
                        logging::Severity::kERROR,
                        context_binary_file);
+
+  // Clean up
+  ASSERT_EQ(std::remove(context_binary_file.c_str()), 0);
+  ASSERT_EQ(std::remove(qnn_ctx_bin.c_str()), 0);
 }
 
 // Run QDQ model on HTP 2 times
@@ -898,6 +903,9 @@ TEST_F(QnnHTPBackendTests, ContextBinaryCache_InvalidGraph) {
   ASSERT_STATUS_OK(session_object.Load(qnn_ctx_model_data.data(), static_cast<int>(qnn_ctx_model_data.size())));
   // Verify the return status with code INVALID_GRAPH
   ASSERT_TRUE(session_object.Initialize().Code() == common::StatusCode::INVALID_GRAPH);
+
+  // Clean up
+  ASSERT_EQ(std::remove(context_binary_file.c_str()), 0);
 }
 
 // Run QDQ model on HTP with 2 inputs
@@ -955,6 +963,8 @@ TEST_F(QnnHTPBackendTests, ContextBinary2InputsTest) {
                        QDQTolerance(),
                        logging::Severity::kERROR,
                        context_binary_file);
+  // Clean up
+  ASSERT_EQ(std::remove(context_binary_file.c_str()), 0);
 }
 
 TEST_F(QnnHTPBackendTests, QuantAccuracyTest) {

From 81d363045ba273b16a3ec654c53a15217a2d2a36 Mon Sep 17 00:00:00 2001
From: Changming Sun
Date: Tue, 16 Jan 2024 17:25:18 -0800
Subject: [PATCH 07/15] Upgrade Ubuntu machine pool from 20.04 to 22.04 (#19117)

### Description
Upgrade Ubuntu machine pool from 20.04 to 22.04
---
 .../build-perf-test-binaries-pipeline.yml     |  2 +-
 .../c-api-noopenmp-packaging-pipelines.yml    |  2 +-
 ...lean-build-docker-image-cache-pipeline.yml | 10 +-------
 .../cuda-packaging-pipeline.yml               |  2 +-
 .../azure-pipelines/linux-ci-pipeline.yml     |  4 ++--
 .../linux-cpu-aten-pipeline.yml               |  2 +-
 .../linux-cpu-eager-pipeline.yml              |  2 +-
 .../azure-pipelines/linux-gpu-ci-pipeline.yml |  2 +-
 .../linux-migraphx-ci-pipeline.yml            |  2 +-
 .../npm-packaging-pipeline.yml                |  4 ++--
 .../nuget/templates/test_linux.yml            |  2 +-
 .../orttraining-linux-ci-pipeline.yml         |  2 +-
 .../orttraining-pai-ci-pipeline.yml           |  4 ++--
 .../orttraining-py-packaging-pipeline-cpu.yml |  2 +-
 .../azure-pipelines/post-merge-jobs.yml       |  6 ++---
 .../py-package-test-pipeline.yml              |  2 +-
 .../stages/py-cuda-packaging-stage.yml        |  2 +-
 .../stages/py-cuda-publishing-stage.yml       |  2 +-
 .../templates/android-java-api-aar.yml        |  2 +-
 .../templates/build-linux-wasm-step.yml       | 22 +++++++++----------
 .../azure-pipelines/templates/c-api-cpu.yml   |  4 ++--
 .../templates/c-api-linux-cpu.yml             |  2 +-
 .../azure-pipelines/templates/linux-ci.yml    |  2 +-
 .../linux-cpu-packaging-pipeline.yml          |  2 +-
 .../templates/linux-wasm-ci.yml               |  2 +-
 ...device-training-cpu-packaging-pipeline.yml |  2 +-
 .../py-packaging-selectable-stage.yml         |  2 +-
 .../templates/py-packaging-stage.yml          |  4 ++--
 .../github/azure-pipelines/templates/rocm.yml |  2 +-
 .../azure-pipelines/web-ci-pipeline.yml       |  2 +-
 .../linux/build_linux_python_package.sh       |  6 ++---
 .../ci_build/github/linux/run_python_tests.sh |  2 +-
 32 files changed, 50 insertions(+), 60 deletions(-)

diff --git a/tools/ci_build/github/azure-pipelines/build-perf-test-binaries-pipeline.yml b/tools/ci_build/github/azure-pipelines/build-perf-test-binaries-pipeline.yml
index 3ddc167bc0a61..d37e9bdc5da4c 100644
--- 
a/tools/ci_build/github/azure-pipelines/build-perf-test-binaries-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/build-perf-test-binaries-pipeline.yml @@ -28,7 +28,7 @@ stages: artifactName: 'onnxruntime-android-full-aar' job_name_suffix: 'Full' publish_executables: '1' - pool_name: 'onnxruntime-Ubuntu2004-AMD-CPU' + pool_name: 'onnxruntime-Ubuntu2204-AMD-CPU' # build Python packages # Linux GPU only diff --git a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml index 2169a3ce1bb9e..3803333bd880a 100644 --- a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml +++ b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml @@ -246,7 +246,7 @@ stages: workspace: clean: all timeoutInMinutes: 120 - pool: onnxruntime-Ubuntu2004-AMD-CPU + pool: onnxruntime-Ubuntu2204-AMD-CPU variables: RocmVersion: '5.6' steps: diff --git a/tools/ci_build/github/azure-pipelines/clean-build-docker-image-cache-pipeline.yml b/tools/ci_build/github/azure-pipelines/clean-build-docker-image-cache-pipeline.yml index 24086b6166fe4..43e668eef8d00 100644 --- a/tools/ci_build/github/azure-pipelines/clean-build-docker-image-cache-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/clean-build-docker-image-cache-pipeline.yml @@ -19,8 +19,7 @@ variables: jobs: - job: Clean_Build_Docker_Image_Cache - pool: - vmImage: 'ubuntu-20.04' + pool: onnxruntime-Ubuntu2204-AMD-CPU timeoutInMinutes: 30 @@ -29,13 +28,6 @@ jobs: submodules: false fetchDepth: 1 - - task: UsePythonVersion@0 - inputs: - versionSpec: '3.9' - addToPath: true - architecture: 'x64' - displayName: "Use Python 3.9" - - task: AzureCLI@2 inputs: azureSubscription: 'AIInfraBuild' diff --git a/tools/ci_build/github/azure-pipelines/cuda-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/cuda-packaging-pipeline.yml index df7b5f59d28fc..1d2ba88652f48 100644 --- a/tools/ci_build/github/azure-pipelines/cuda-packaging-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/cuda-packaging-pipeline.yml @@ -126,7 +126,7 @@ stages: BaseImage: 'registry.access.redhat.com/ubi8/ubi' OnnxruntimeArch: 'x64' OnnxruntimeNodejsBindingArch: 'x64' - PoolName: 'onnxruntime-Ubuntu2004-AMD-CPU' + PoolName: 'onnxruntime-Ubuntu2204-AMD-CPU' PackageJava: false PackageNodeJS: false # Nuget Packaging diff --git a/tools/ci_build/github/azure-pipelines/linux-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-ci-pipeline.yml index 07f672c75d029..cff7c96aa9253 100644 --- a/tools/ci_build/github/azure-pipelines/linux-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-ci-pipeline.yml @@ -46,7 +46,7 @@ stages: skipComponentGovernanceDetection: true ORT_CACHE_DIR: $(Agent.TempDirectory)/ort_ccache TODAY: $[format('{0:dd}{0:MM}{0:yyyy}', pipeline.startTime)] - pool: onnxruntime-Ubuntu2004-AMD-CPU + pool: onnxruntime-Ubuntu2204-AMD-CPU steps: - task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3 displayName: 'Clean Agent Directories' @@ -123,7 +123,7 @@ stages: skipComponentGovernanceDetection: true ORT_CACHE_DIR: $(Agent.TempDirectory)/ort_ccache TODAY: $[format('{0:dd}{0:MM}{0:yyyy}', pipeline.startTime)] - pool: onnxruntime-Ubuntu2004-AMD-CPU + pool: onnxruntime-Ubuntu2204-AMD-CPU steps: - task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3 displayName: 'Clean Agent Directories' diff --git a/tools/ci_build/github/azure-pipelines/linux-cpu-aten-pipeline.yml 
b/tools/ci_build/github/azure-pipelines/linux-cpu-aten-pipeline.yml index 146186e9eeaf5..090ce97296687 100644 --- a/tools/ci_build/github/azure-pipelines/linux-cpu-aten-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-cpu-aten-pipeline.yml @@ -43,7 +43,7 @@ jobs: variables: CCACHE_DIR: $(Agent.TempDirectory)/ccache TODAY: $[format('{0:dd}{0:MM}{0:yyyy}', pipeline.startTime)] - pool: onnxruntime-Ubuntu2004-AMD-CPU + pool: onnxruntime-Ubuntu2204-AMD-CPU steps: - task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3 displayName: 'Clean Agent Directories' diff --git a/tools/ci_build/github/azure-pipelines/linux-cpu-eager-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-cpu-eager-pipeline.yml index a5c08e95b7efc..d3d13cc5344da 100644 --- a/tools/ci_build/github/azure-pipelines/linux-cpu-eager-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-cpu-eager-pipeline.yml @@ -51,7 +51,7 @@ jobs: timeoutInMinutes: 120 workspace: clean: all - pool: onnxruntime-Ubuntu2004-AMD-CPU + pool: onnxruntime-Ubuntu2204-AMD-CPU steps: - checkout: self clean: true diff --git a/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml index 0993a81a02249..5bc8c3603ee92 100644 --- a/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml @@ -64,7 +64,7 @@ jobs: CCACHE_DIR: $(Pipeline.Workspace)/ccache workspace: clean: all - pool: onnxruntime-Ubuntu2004-AMD-CPU + pool: onnxruntime-Ubuntu2204-AMD-CPU steps: - task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3 displayName: 'Clean Agent Directories' diff --git a/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml index f7571a3b7eab6..9cf7a3fb42397 100644 --- a/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml @@ -46,7 +46,7 @@ jobs: TODAY: $[format('{0:dd}{0:MM}{0:yyyy}', pipeline.startTime)] workspace: clean: all - pool: onnxruntime-Ubuntu2004-AMD-CPU + pool: onnxruntime-Ubuntu2204-AMD-CPU timeoutInMinutes: 120 steps: diff --git a/tools/ci_build/github/azure-pipelines/npm-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/npm-packaging-pipeline.yml index 7f73da23b5eb1..21fc205c72e89 100644 --- a/tools/ci_build/github/azure-pipelines/npm-packaging-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/npm-packaging-pipeline.yml @@ -41,7 +41,7 @@ stages: parameters: NpmPackagingMode: ${{ variables.NpmPackagingMode }} IsReleasePipeline: true - PoolName: 'onnxruntime-Ubuntu2004-AMD-CPU' + PoolName: 'onnxruntime-Ubuntu2204-AMD-CPU' PackageName: 'onnxruntime-web' ExtraBuildArgs: '' UseWebPoolName: true @@ -54,7 +54,7 @@ stages: parameters: NpmPackagingMode: ${{ variables.NpmPackagingMode }} BuildConfig: 'Release' - PoolName: 'onnxruntime-Ubuntu2004-AMD-CPU' + PoolName: 'onnxruntime-Ubuntu2204-AMD-CPU' PackageName: 'onnxruntime-react-native' BuildAndroidAARStageDependsOn: 'Precheck_and_extract_commit' diff --git a/tools/ci_build/github/azure-pipelines/nuget/templates/test_linux.yml b/tools/ci_build/github/azure-pipelines/nuget/templates/test_linux.yml index f44106c145228..2567bec9fdfc2 100644 --- a/tools/ci_build/github/azure-pipelines/nuget/templates/test_linux.yml +++ b/tools/ci_build/github/azure-pipelines/nuget/templates/test_linux.yml @@ -1,5 +1,5 @@ 
parameters: - AgentPool: 'onnxruntime-Ubuntu2004-AMD-CPU' + AgentPool: 'onnxruntime-Ubuntu2204-AMD-CPU' ArtifactSuffix: '' NugetPackageName : '' StageSuffix: 'CPU' diff --git a/tools/ci_build/github/azure-pipelines/orttraining-linux-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/orttraining-linux-ci-pipeline.yml index 018672e0b2dea..26fd5e1ec0b5d 100644 --- a/tools/ci_build/github/azure-pipelines/orttraining-linux-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/orttraining-linux-ci-pipeline.yml @@ -44,7 +44,7 @@ jobs: skipComponentGovernanceDetection: true CCACHE_DIR: $(Pipeline.Workspace)/ccache TODAY: $[format('{0:dd}{0:MM}{0:yyyy}', pipeline.startTime)] - pool: onnxruntime-Ubuntu-2004-Training-CPU + pool: onnxruntime-Ubuntu-2204-Training-CPU steps: - task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3 displayName: 'Clean Agent Directories' diff --git a/tools/ci_build/github/azure-pipelines/orttraining-pai-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/orttraining-pai-ci-pipeline.yml index a53f91fb317cb..71b224b65964f 100644 --- a/tools/ci_build/github/azure-pipelines/orttraining-pai-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/orttraining-pai-ci-pipeline.yml @@ -37,7 +37,7 @@ jobs: TODAY: $[format('{0:dd}{0:MM}{0:yyyy}', pipeline.startTime)] workspace: clean: all - pool: onnxruntime-Ubuntu2004-AMD-CPU + pool: onnxruntime-Ubuntu2204-AMD-CPU timeoutInMinutes: 120 steps: @@ -132,7 +132,7 @@ jobs: TODAY: $[format('{0:dd}{0:MM}{0:yyyy}', pipeline.startTime)] workspace: clean: all - pool: onnxruntime-Ubuntu2004-AMD-CPU + pool: onnxruntime-Ubuntu2204-AMD-CPU timeoutInMinutes: 120 steps: diff --git a/tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline-cpu.yml b/tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline-cpu.yml index 817ace0571837..a44a8c215939f 100644 --- a/tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline-cpu.yml +++ b/tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline-cpu.yml @@ -16,7 +16,7 @@ stages: timeoutInMinutes: 180 workspace: clean: all - pool: onnxruntime-Ubuntu2004-AMD-CPU + pool: onnxruntime-Ubuntu2204-AMD-CPU strategy: matrix: diff --git a/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml b/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml index 5ee39876733e2..3ec5400dacc65 100644 --- a/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml +++ b/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml @@ -4,7 +4,7 @@ stages: parameters: NpmPackagingMode: 'dev' IsReleasePipeline: true - PoolName: 'onnxruntime-Ubuntu2004-AMD-CPU' + PoolName: 'onnxruntime-Ubuntu2204-AMD-CPU' BuildStaticLib: true ExtraBuildArgs: '' UseWebPoolName: true @@ -367,7 +367,7 @@ stages: timeoutInMinutes: 150 variables: skipComponentGovernanceDetection: true - pool: 'onnxruntime-Ubuntu2004-AMD-CPU' + pool: 'onnxruntime-Ubuntu2204-AMD-CPU' steps: - template: templates/set-version-number-variables-step.yml @@ -413,7 +413,7 @@ stages: - job: AndroidCustomBuildScript workspace: clean: all - pool: 'onnxruntime-Ubuntu2004-AMD-CPU' + pool: 'onnxruntime-Ubuntu2204-AMD-CPU' variables: dockerImageTag: onnxruntime-android-custom-build steps: diff --git a/tools/ci_build/github/azure-pipelines/py-package-test-pipeline.yml b/tools/ci_build/github/azure-pipelines/py-package-test-pipeline.yml index 55d3150f21aa3..04f555deb1a22 100644 --- a/tools/ci_build/github/azure-pipelines/py-package-test-pipeline.yml +++ 
b/tools/ci_build/github/azure-pipelines/py-package-test-pipeline.yml @@ -18,7 +18,7 @@ stages: - template: templates/py-packaging-linux-test-cpu.yml parameters: arch: 'x86_64' - machine_pool: 'onnxruntime-Ubuntu2004-AMD-CPU' + machine_pool: 'onnxruntime-Ubuntu2204-AMD-CPU' base_image: 'registry.access.redhat.com/ubi8/ubi' devtoolset_rootpath: /opt/rh/gcc-toolset-12/root ld_library_path_arg: /opt/rh/gcc-toolset-12/root/usr/lib64:/opt/rh/gcc-toolset-12/root/usr/lib:/opt/rh/gcc-toolset-12/root/usr/lib64/dyninst:/opt/rh/gcc-toolset-12/root/usr/lib/dyninst:/usr/local/lib64 diff --git a/tools/ci_build/github/azure-pipelines/stages/py-cuda-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/py-cuda-packaging-stage.yml index e6d8ee35e75e3..f82c80d4d7e93 100644 --- a/tools/ci_build/github/azure-pipelines/stages/py-cuda-packaging-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/py-cuda-packaging-stage.yml @@ -105,7 +105,7 @@ stages: - template: ../templates/py-linux-gpu.yml parameters: arch: 'x86_64' - machine_pool: 'onnxruntime-Ubuntu2004-AMD-CPU' + machine_pool: 'onnxruntime-Ubuntu2204-AMD-CPU' extra_build_arg: ${{ parameters.build_py_parameters }} cmake_build_type: ${{ parameters.cmake_build_type }} docker_base_image: ${{ variables.docker_base_image }} diff --git a/tools/ci_build/github/azure-pipelines/stages/py-cuda-publishing-stage.yml b/tools/ci_build/github/azure-pipelines/stages/py-cuda-publishing-stage.yml index 4f440e0f61b3d..2a4debcf9fba5 100644 --- a/tools/ci_build/github/azure-pipelines/stages/py-cuda-publishing-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/py-cuda-publishing-stage.yml @@ -20,7 +20,7 @@ stages: dependsOn: [] jobs: - job: - pool: 'onnxruntime-Ubuntu2004-AMD-CPU' + pool: 'onnxruntime-Ubuntu2204-AMD-CPU' steps: - checkout: none - task: DownloadPipelineArtifact@2 diff --git a/tools/ci_build/github/azure-pipelines/templates/android-java-api-aar.yml b/tools/ci_build/github/azure-pipelines/templates/android-java-api-aar.yml index 5e61f88b4aa18..509fea45ebe53 100644 --- a/tools/ci_build/github/azure-pipelines/templates/android-java-api-aar.yml +++ b/tools/ci_build/github/azure-pipelines/templates/android-java-api-aar.yml @@ -33,7 +33,7 @@ parameters: - name: pool_name displayName: Pool name type: string - default: 'onnxruntime-Ubuntu2004-AMD-CPU' + default: 'onnxruntime-Ubuntu2204-AMD-CPU' - name: packageName # now we can build onnxruntime or onnxruntime-mobile for Android, need specify it here diff --git a/tools/ci_build/github/azure-pipelines/templates/build-linux-wasm-step.yml b/tools/ci_build/github/azure-pipelines/templates/build-linux-wasm-step.yml index e664cf69dec76..e77b1a4008b7c 100644 --- a/tools/ci_build/github/azure-pipelines/templates/build-linux-wasm-step.yml +++ b/tools/ci_build/github/azure-pipelines/templates/build-linux-wasm-step.yml @@ -24,19 +24,17 @@ parameters: type: string steps: - - task: Cache@2 - inputs: - ${{if eq(variables['Build.SourceBranchName'], 'merge')}}: - key: ' "${{parameters.TODAY}}" | ${{parameters.AdditionalKey}} | merge ' - ${{else}}: - key: '"${{parameters.TODAY}}" | ${{parameters.AdditionalKey}} | $(Build.SourceVersion) ' - path: ${{parameters.CacheDir}} - restoreKeys: | - "${{parameters.TODAY}}" | ${{parameters.AdditionalKey}} - displayName: Cache Task - condition: eq('${{parameters.WithCache}}', true) - - ${{if eq(parameters.WithCache, true)}}: + - task: Cache@2 + inputs: + ${{if eq(variables['Build.SourceBranchName'], 'merge')}}: + key: ' "${{parameters.TODAY}}" | 
${{parameters.AdditionalKey}} | merge ' + ${{else}}: + key: '"${{parameters.TODAY}}" | ${{parameters.AdditionalKey}} | $(Build.SourceVersion) ' + path: ${{parameters.CacheDir}} + restoreKeys: | + "${{parameters.TODAY}}" | ${{parameters.AdditionalKey}} + displayName: Cache Task - script: | set -e -x pushd '$(Build.SourcesDirectory)/cmake/external/emsdk' diff --git a/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml b/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml index 81319e07c6b17..168602a17910b 100644 --- a/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml +++ b/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml @@ -759,7 +759,7 @@ stages: - template: ../nuget/templates/test_linux.yml parameters: - AgentPool : onnxruntime-Ubuntu2004-AMD-CPU + AgentPool : onnxruntime-Ubuntu2204-AMD-CPU NugetPackageName : 'Microsoft.ML.OnnxRuntime' ArtifactSuffix: 'CPU' SpecificArtifact: ${{ parameters.SpecificArtifact }} @@ -796,7 +796,7 @@ stages: OS: Linux BuildId: ${{ parameters.BuildId }} SpecificArtifact: ${{ parameters.SpecificArtifact }} - PoolName: 'onnxruntime-Ubuntu2004-AMD-CPU' + PoolName: 'onnxruntime-Ubuntu2204-AMD-CPU' - template: final-jar-testing.yml parameters: diff --git a/tools/ci_build/github/azure-pipelines/templates/c-api-linux-cpu.yml b/tools/ci_build/github/azure-pipelines/templates/c-api-linux-cpu.yml index 8538f15e93753..cf470b3fa2448 100644 --- a/tools/ci_build/github/azure-pipelines/templates/c-api-linux-cpu.yml +++ b/tools/ci_build/github/azure-pipelines/templates/c-api-linux-cpu.yml @@ -19,7 +19,7 @@ parameters: - name: PoolName type: string - default: 'onnxruntime-Ubuntu2004-AMD-CPU' + default: 'onnxruntime-Ubuntu2204-AMD-CPU' - name: ArtifactNamePrefix type: string diff --git a/tools/ci_build/github/azure-pipelines/templates/linux-ci.yml b/tools/ci_build/github/azure-pipelines/templates/linux-ci.yml index 7b9788d90b17d..15165e3cb0950 100644 --- a/tools/ci_build/github/azure-pipelines/templates/linux-ci.yml +++ b/tools/ci_build/github/azure-pipelines/templates/linux-ci.yml @@ -1,5 +1,5 @@ parameters: - AgentPool : 'onnxruntime-Ubuntu2004-AMD-CPU' + AgentPool : 'onnxruntime-Ubuntu2204-AMD-CPU' StageName : 'Linux_CI_Dev' RunDockerBuildArgs: '-o ubuntu20.04 -d cpu -x "--build_wheel"' NuPackScript: '' diff --git a/tools/ci_build/github/azure-pipelines/templates/linux-cpu-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/templates/linux-cpu-packaging-pipeline.yml index 6ad5f9f38a4db..8972d55f6e190 100644 --- a/tools/ci_build/github/azure-pipelines/templates/linux-cpu-packaging-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/templates/linux-cpu-packaging-pipeline.yml @@ -32,7 +32,7 @@ stages: BaseImage: 'registry.access.redhat.com/ubi8/ubi' OnnxruntimeArch: 'x64' OnnxruntimeNodejsBindingArch: 'x64' - PoolName: 'onnxruntime-Ubuntu2004-AMD-CPU' + PoolName: 'onnxruntime-Ubuntu2204-AMD-CPU' ArtifactNamePrefix: ${{ parameters.ArtifactNamePrefix }} PackageJava: ${{ parameters.PackageJava }} PackageNodeJS: ${{ parameters.PackageNodeJS }} diff --git a/tools/ci_build/github/azure-pipelines/templates/linux-wasm-ci.yml b/tools/ci_build/github/azure-pipelines/templates/linux-wasm-ci.yml index e6693a6f6d26a..d279e667f9091 100644 --- a/tools/ci_build/github/azure-pipelines/templates/linux-wasm-ci.yml +++ b/tools/ci_build/github/azure-pipelines/templates/linux-wasm-ci.yml @@ -13,7 +13,7 @@ parameters: - name: PoolName type: string - default: 'onnxruntime-Ubuntu2004-AMD-CPU' + default: 'onnxruntime-Ubuntu2204-AMD-CPU' - name: 
SkipPublish type: boolean diff --git a/tools/ci_build/github/azure-pipelines/templates/ondevice-training-cpu-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/templates/ondevice-training-cpu-packaging-pipeline.yml index 51583a25f63ac..cf39be23cbdaf 100644 --- a/tools/ci_build/github/azure-pipelines/templates/ondevice-training-cpu-packaging-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/templates/ondevice-training-cpu-packaging-pipeline.yml @@ -336,7 +336,7 @@ stages: - template: ../nuget/templates/test_linux.yml parameters: - AgentPool : onnxruntime-Ubuntu2004-AMD-CPU + AgentPool : onnxruntime-Ubuntu2204-AMD-CPU NugetPackageName : 'Microsoft.ML.OnnxRuntime.Training' ArtifactSuffix: 'Training-CPU' StageSuffix: 'Training_CPU' diff --git a/tools/ci_build/github/azure-pipelines/templates/py-packaging-selectable-stage.yml b/tools/ci_build/github/azure-pipelines/templates/py-packaging-selectable-stage.yml index 00ba5ea4a475a..01cab936aa529 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-packaging-selectable-stage.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-packaging-selectable-stage.yml @@ -48,7 +48,7 @@ stages: timeoutInMinutes: 90 workspace: clean: all - pool: onnxruntime-Ubuntu2004-AMD-CPU + pool: onnxruntime-Ubuntu2204-AMD-CPU strategy: matrix: ${{ each PythonVersion in parameters.python_version }}: diff --git a/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml index abe06e80f4f19..8669a883c31f1 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml @@ -430,7 +430,7 @@ stages: - template: py-linux.yml parameters: arch: 'x86_64' - machine_pool: 'onnxruntime-Ubuntu2004-AMD-CPU' + machine_pool: 'onnxruntime-Ubuntu2204-AMD-CPU' base_image: 'registry.access.redhat.com/ubi8/ubi' devtoolset_rootpath: /opt/rh/gcc-toolset-12/root ld_library_path_arg: /opt/rh/gcc-toolset-12/root/usr/lib64:/opt/rh/gcc-toolset-12/root/usr/lib:/opt/rh/gcc-toolset-12/root/usr/lib64/dyninst:/opt/rh/gcc-toolset-12/root/usr/lib/dyninst:/usr/local/lib64 @@ -443,6 +443,6 @@ stages: - template: py-linux-gpu.yml parameters: arch: 'x86_64' - machine_pool: 'onnxruntime-Ubuntu2004-AMD-CPU' + machine_pool: 'onnxruntime-Ubuntu2204-AMD-CPU' extra_build_arg: ${{ parameters.build_py_parameters }} cmake_build_type: ${{ parameters.cmake_build_type }} diff --git a/tools/ci_build/github/azure-pipelines/templates/rocm.yml b/tools/ci_build/github/azure-pipelines/templates/rocm.yml index 2e9e6c6b35a2e..43a80aa4fd4e3 100644 --- a/tools/ci_build/github/azure-pipelines/templates/rocm.yml +++ b/tools/ci_build/github/azure-pipelines/templates/rocm.yml @@ -14,7 +14,7 @@ jobs: workspace: clean: all timeoutInMinutes: 180 - pool: Ubuntu-2004-rocm-aiinfra + pool: Ubuntu-2204-rocm-aiinfra variables: - name: PythonVersion value: ${{ parameters.PythonVersion }} diff --git a/tools/ci_build/github/azure-pipelines/web-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/web-ci-pipeline.yml index e352a04068ee8..24809ccfdec1f 100644 --- a/tools/ci_build/github/azure-pipelines/web-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/web-ci-pipeline.yml @@ -53,7 +53,7 @@ stages: parameters: NpmPackagingMode: ${{ variables.NpmPackagingMode }} IsReleasePipeline: false - PoolName: 'onnxruntime-Ubuntu2004-AMD-CPU' + PoolName: 'onnxruntime-Ubuntu2204-AMD-CPU' BuildStaticLib: true ExtraBuildArgs: 
$(ExtraBuildArgs)
      WASMTemplate: linux-wasm-ci.yml
diff --git a/tools/ci_build/github/linux/build_linux_python_package.sh b/tools/ci_build/github/linux/build_linux_python_package.sh
index 1059dd5047477..933d1f3d5874a 100755
--- a/tools/ci_build/github/linux/build_linux_python_package.sh
+++ b/tools/ci_build/github/linux/build_linux_python_package.sh
@@ -7,9 +7,9 @@ mkdir -p /build/dist
 
 EXTRA_ARG=""
 
-# Put 3.8 at the last because Ubuntu 20.04 use python 3.8 and we will upload the intermediate build files of this
-# config to Azure DevOps Artifacts and download them to a Ubuntu 20.04 machine to run the tests.
-PYTHON_EXES=("/opt/python/cp39-cp39/bin/python3.9" "/opt/python/cp310-cp310/bin/python3.10" "/opt/python/cp311-cp311/bin/python3.11" "/opt/python/cp312-cp312/bin/python3.12" "/opt/python/cp38-cp38/bin/python3.8")
+# Put 3.10 at the last because Ubuntu 22.04 uses python 3.10 and we will upload the intermediate build files of this
+# config to Azure DevOps Artifacts and download them to an Ubuntu 22.04 machine to run the tests.
+PYTHON_EXES=("/opt/python/cp38-cp38/bin/python3.8" "/opt/python/cp39-cp39/bin/python3.9" "/opt/python/cp311-cp311/bin/python3.11" "/opt/python/cp312-cp312/bin/python3.12" "/opt/python/cp310-cp310/bin/python3.10")
 while getopts "d:p:x:c:" parameter_Option
 do case "${parameter_Option}" in
diff --git a/tools/ci_build/github/linux/run_python_tests.sh b/tools/ci_build/github/linux/run_python_tests.sh
index 3164a10a09dfd..082c561dd17b9 100755
--- a/tools/ci_build/github/linux/run_python_tests.sh
+++ b/tools/ci_build/github/linux/run_python_tests.sh
@@ -15,7 +15,7 @@ c) BUILD_CONFIG=${OPTARG};;
 esac
 done
 
-export PATH=/opt/python/cp38-cp38/bin:$PATH
+export PATH=/opt/python/cp310-cp310/bin:$PATH
 cd /build
 files=(whl/*.whl)
 FILE_NAME="${files[0]}"

From 07d3aed3aa3a054deb502cedf867f559fc690755 Mon Sep 17 00:00:00 2001
From: Wanming Lin
Date: Wed, 17 Jan 2024 13:35:13 +0800
Subject: [PATCH 08/15] [WebNN EP] Fixed build issue with disable_rtti (#19173)

Previously, building the WebNN EP with --disable_rtti threw an
unboundTypeError, since unbound type names are illegal with RTTI
disabled in the Embind API. We can fix this by adding a
-DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0 flag.
---
 cmake/adjust_global_compile_flags.cmake |  5 +++++
 cmake/onnxruntime_webassembly.cmake     |  5 ++++-
 tools/ci_build/build.py                 |  4 ----
 3 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/cmake/adjust_global_compile_flags.cmake b/cmake/adjust_global_compile_flags.cmake
index 30d8cbf78fb1a..2c7bf9f1c2f5c 100644
--- a/cmake/adjust_global_compile_flags.cmake
+++ b/cmake/adjust_global_compile_flags.cmake
@@ -123,6 +123,11 @@ if (onnxruntime_DISABLE_RTTI)
     add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:/GR->" "$<$<COMPILE_LANGUAGE:CXX>:/we4541>")
   else()
     add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:-fno-rtti>")
+    if (onnxruntime_USE_WEBNN)
+      # Avoid unboundTypeError for WebNN EP since unbound type names are illegal with RTTI disabled
+      # in Embind API, relevant issue: https://github.com/emscripten-core/emscripten/issues/7001
+      add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:-DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0>")
+    endif()
   endif()
 else()
   #MSVC RTTI flag /GR is not added to CMAKE_CXX_FLAGS by default. But, anyway VC++2019 treats "/GR" default on.
diff --git a/cmake/onnxruntime_webassembly.cmake b/cmake/onnxruntime_webassembly.cmake index 858583e64e9df..546d50c1ca2d3 100644 --- a/cmake/onnxruntime_webassembly.cmake +++ b/cmake/onnxruntime_webassembly.cmake @@ -268,7 +268,10 @@ else() endif() if (onnxruntime_USE_WEBNN) - set_property(TARGET onnxruntime_webassembly APPEND_STRING PROPERTY LINK_FLAGS " --bind -sWASM_BIGINT") + set_property(TARGET onnxruntime_webassembly APPEND_STRING PROPERTY LINK_FLAGS " --bind -sWASM_BIGINT") + if (onnxruntime_DISABLE_RTTI) + set_property(TARGET onnxruntime_webassembly APPEND_STRING PROPERTY LINK_FLAGS " -fno-rtti -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0") + endif() endif() # Set link flag to enable exceptions support, this will override default disabling exception throwing behavior when disable exceptions. diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py index 1a6262edf45c9..1034a82cb2854 100644 --- a/tools/ci_build/build.py +++ b/tools/ci_build/build.py @@ -1283,10 +1283,6 @@ def generate_build_tree( if args.use_webnn: if not args.build_wasm: raise BuildError("WebNN is only available for WebAssembly build.") - if args.disable_rtti: - # Avoid unboundTypeError for WebNN EP since unbound type names are illegal with RTTI disabled - # in Embind API, relevant issue: https://github.com/emscripten-core/emscripten/issues/16911 - raise BuildError("WebNN is not supported with RTTI disabled.") cmake_args += ["-Donnxruntime_USE_WEBNN=ON"] if args.use_snpe: From 9876cc7c4f5f6249e1dec8b93abf7b8dfcf5ca0c Mon Sep 17 00:00:00 2001 From: wejoncy Date: Wed, 17 Jan 2024 15:46:19 +0800 Subject: [PATCH 09/15] more inputs support for LLM exporter (#19005) ### Description ### Motivation and Context --- .../transformers/large_model_exporter.py | 21 ++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/onnxruntime/python/tools/transformers/large_model_exporter.py b/onnxruntime/python/tools/transformers/large_model_exporter.py index 1601b1a203b9a..9e8b284bf56c7 100644 --- a/onnxruntime/python/tools/transformers/large_model_exporter.py +++ b/onnxruntime/python/tools/transformers/large_model_exporter.py @@ -224,24 +224,35 @@ def fetch_onnx_inputs_outputs_name( if not num_of_past_key: num_of_past_key = model.config.num_hidden_layers - onnx_inp_names = ("input_ids", "attention_mask") + # filter out constant inputs + onnx_inp_names = tuple( + [torch_input_names[i] for i in range(len(torch_input_names)) if isinstance(onnx_inputs[i], torch.Tensor)] + ) + assert ( + "input_ids" in onnx_inp_names and "attention_mask" in onnx_inp_names + ), "input_ids and attention_mask must be existed in inputs" onnx_out_names = ("logits",) onnx_dynamic_axes = { "input_ids": {0: "batch_size", 1: "seq_len"}, "attention_mask": {0: "batch_size", 1: "seq_len"}, } + # add dyanmic dimensions for the unkonw inputs + for idx, name in enumerate(onnx_inp_names): + if name not in onnx_dynamic_axes: + unknown_dims = {i: f"{idx}__unknown_dims__{i}" for i in range(onnx_inputs[idx].dim())} + onnx_dynamic_axes[name] = unknown_dims if input_with_past: for i in range(num_of_past_key): - onnx_inp_names += (f"present_key.{i}",) - onnx_inp_names += (f"present_values.{i}",) + onnx_inp_names += (f"past_key_values.{i}.key",) + onnx_inp_names += (f"past_key_values.{i}.value",) onnx_dynamic_axes[onnx_inp_names[-1]] = kv_cache_axis onnx_dynamic_axes[onnx_inp_names[-2]] = kv_cache_axis if with_past or input_with_past: for i in range(num_of_past_key): - onnx_out_names += (f"past_key.{i}",) - onnx_out_names += (f"past_values.{i}",) + 
onnx_out_names += (f"present.{i}.key",) + onnx_out_names += (f"present.{i}.value",) onnx_dynamic_axes[onnx_out_names[-1]] = kv_cache_axis onnx_dynamic_axes[onnx_out_names[-2]] = kv_cache_axis From 63dd605d3310f5a9540c414216f3f3b67d455c4a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Xavier=20Dupr=C3=A9?= Date: Wed, 17 Jan 2024 19:00:36 +0100 Subject: [PATCH 10/15] Fix untyped float values in quantization tool missing from PR #18043 (#19182) ### Description Extends the code coverage to the Entropy, Histogram and Distribution calibration methods, fixing bugs while doing it. ### Motivation and Context Bugs detected in [Olive](https://github.com/microsoft/OLive). --- .../python/tools/quantization/calibrate.py | 86 +++++++++++++++---- .../python/tools/quantization/quant_utils.py | 2 +- .../python/quantization/test_op_matmul.py | 66 +++++++++++++- 3 files changed, 131 insertions(+), 23 deletions(-) diff --git a/onnxruntime/python/tools/quantization/calibrate.py b/onnxruntime/python/tools/quantization/calibrate.py index d0db57c392961..77b3dce9fb004 100644 --- a/onnxruntime/python/tools/quantization/calibrate.py +++ b/onnxruntime/python/tools/quantization/calibrate.py @@ -5,6 +5,7 @@ # license information. # -------------------------------------------------------------------------- import abc +import copy import itertools import os import uuid @@ -21,6 +22,48 @@ from .quant_utils import apply_plot, load_model_with_shape_infer, smooth_distribution +def rel_entr(pk: np.ndarray, qk: np.ndarray) -> np.ndarray: + """ + See https://docs.scipy.org/doc/scipy/reference/generated/scipy.special.rel_entr.html#scipy.special.rel_entr. + Python implementation. + """ + res = np.empty(pk.shape, dtype=pk.dtype) + res[:] = pk[:] * np.log(pk[:] / qk[:]) + c2 = (pk == 0) & (qk >= 0) + res[c2] = 0 + c1 = (pk > 0) & (qk > 0) + res[~c1] = np.inf + return res + + +def entropy( + pk: np.ndarray, + qk: np.ndarray, + base: Optional[float] = None, + axis: int = 0, +) -> np.ndarray: + """ + Simplified version of entropy. + Source: https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.entropy.html. + This avoids taking a dependency on scipy just for this function. + """ + assert base is None or base > 0, f"base={base} must be a positive number or `None`." 
+ assert qk is not None, "qk is None" + + pk = np.asarray(pk).astype(np.float32) + pk = 1.0 * pk / np.sum(pk, axis=axis, keepdims=True) + + qk = np.asarray(qk).astype(np.float32) + pk, qk = np.broadcast_arrays(pk, qk) + qk = 1.0 * qk / np.sum(qk, axis=axis, keepdims=True) + vec = rel_entr(pk, qk) + + s = np.sum(vec, axis=axis) + if base is not None: + s /= np.log(base) + return s.astype(pk.dtype) + + class TensorData: _allowed = frozenset(["avg", "std", "lowest", "highest", "hist", "hist_edges", "bins"]) _floats = frozenset(["avg", "std", "lowest", "highest", "hist_edges"]) @@ -708,8 +751,8 @@ def collect_absolute_value(self, name_to_arr): min_value = np.min(data_arr_np) max_value = np.max(data_arr_np) else: - min_value = 0 - max_value = 0 + min_value = np.array(0, dtype=data_arr_np.dtype) + max_value = np.array(0, dtype=data_arr_np.dtype) data_arr_np = np.absolute(data_arr_np) # only consider absolute value @@ -725,6 +768,8 @@ def collect_absolute_value(self, name_to_arr): old_histogram = self.histogram_dict[tensor] old_min = old_histogram[2] old_max = old_histogram[3] + assert hasattr(old_min, "dtype"), f"old_min should be a numpy array but is {type(old_min)}" + assert hasattr(old_max, "dtype"), f"old_max should be a numpy array but is {type(old_max)}" old_hist = old_histogram[0] old_hist_edges = old_histogram[1] temp_amax = np.max(data_arr_np) @@ -757,7 +802,7 @@ def collect_value(self, name_to_arr): min_value = np.array(0, dtype=data_arr.dtype) max_value = np.array(0, dtype=data_arr.dtype) - threshold = max(abs(min_value), abs(max_value)) + threshold = np.array(max(abs(min_value), abs(max_value)), dtype=data_arr.dtype) if tensor in self.histogram_dict: old_histogram = self.histogram_dict[tensor] @@ -809,7 +854,7 @@ def merge_histogram(self, old_histogram, data_arr, new_min, new_max, new_thresho def compute_collection_result(self): if not self.histogram_dict or len(self.histogram_dict) == 0: raise ValueError("Histogram has not been collected. Please run collect() first.") - print(f"Finding optimal threshold for each tensor using {self.method} algorithm ...") + print(f"Finding optimal threshold for each tensor using {self.method!r} algorithm ...") if self.method == "entropy": return self.compute_entropy() @@ -938,7 +983,14 @@ def compute_distribution(self): assert avg_coef.dtype != np.float64 assert std_coef.dtype != np.float64 assert hist_edges.dtype != np.float64 - thresholds_dict[tensor] = TensorData(avg=avg_coef, std=std_coef, hist=hist, hist_edges=hist_edges) + thresholds_dict[tensor] = TensorData( + avg=avg_coef, + std=std_coef, + hist=hist, + hist_edges=hist_edges, + lowest=hist_edges.min(), + highest=hist_edges.max(), + ) # Plot histogram for debug only if os.environ.get("QUANTIZATION_DEBUG", 0) in (1, "1"): @@ -952,18 +1004,15 @@ def get_entropy_threshold(self, histogram, num_quantized_bins): `q` is a truncated version of the original distribution. 
Ref: http://on-demand.gputechconf.com/gtc/2017/presentation/s7310-8-bit-inference-with-tensorrt.pdf """ - import copy - - from scipy.stats import entropy - hist = histogram[0] hist_edges = histogram[1] num_bins = hist.size zero_bin_index = num_bins // 2 num_half_quantized_bin = num_quantized_bins // 2 + dtype = histogram[1].dtype kl_divergence = np.zeros(zero_bin_index - num_half_quantized_bin + 1) - thresholds = [(0, 0) for i in range(kl_divergence.size)] + thresholds = [(np.array(0, dtype=dtype), np.array(0, dtype=dtype)) for i in range(kl_divergence.size)] # <------------ num bins ----------------> # <--- quantized bins ----> @@ -983,10 +1032,7 @@ def get_entropy_threshold(self, histogram, num_quantized_bins): start_index = zero_bin_index - i end_index = zero_bin_index + i + 1 if (zero_bin_index + i + 1) <= num_bins else num_bins - thresholds[i - num_half_quantized_bin] = ( - float(hist_edges[start_index]), - float(hist_edges[end_index]), - ) + thresholds[i - num_half_quantized_bin] = (hist_edges[start_index], hist_edges[end_index]) sliced_distribution = copy.deepcopy(hist[start_index:end_index]) @@ -1020,15 +1066,15 @@ def get_entropy_threshold(self, histogram, num_quantized_bins): norm = sum(nonzeros[start:end]) if norm != 0: - q[start:end] = float(quantized_bins[index]) / float(norm) + q[start:end] = quantized_bins[index] / norm p = smooth_distribution(p) q = smooth_distribution(q) - - if isinstance(q, np.ndarray): - kl_divergence[i - num_half_quantized_bin] = entropy(p, q) + if p is None or q is None: + div = np.array(np.inf, dtype=dtype) else: - kl_divergence[i - num_half_quantized_bin] = float("inf") + div = np.array(entropy(p, q), dtype=dtype) + kl_divergence[i - num_half_quantized_bin] = div min_kl_divergence_idx = np.argmin(kl_divergence) optimal_threshold = thresholds[min_kl_divergence_idx] @@ -1038,6 +1084,8 @@ def get_entropy_threshold(self, histogram, num_quantized_bins): optimal_threshold = (min_value, optimal_threshold[1]) if optimal_threshold[1] > max_value: optimal_threshold = (optimal_threshold[0], max_value) + assert hasattr(optimal_threshold[0], "dtype") + assert hasattr(optimal_threshold[1], "dtype") return optimal_threshold diff --git a/onnxruntime/python/tools/quantization/quant_utils.py b/onnxruntime/python/tools/quantization/quant_utils.py index 68c2b3bf79c8b..036f49b420734 100644 --- a/onnxruntime/python/tools/quantization/quant_utils.py +++ b/onnxruntime/python/tools/quantization/quant_utils.py @@ -653,7 +653,7 @@ def smooth_distribution(p, eps=0.0001): if not n_nonzeros: # raise ValueError('The discrete probability distribution is malformed. 
All entries are 0.') - return -1 + return None eps1 = eps * float(n_zeros) / float(n_nonzeros) assert eps1 < 1.0, "n_zeros=%d, n_nonzeros=%d, eps1=%f" % ( n_zeros, diff --git a/onnxruntime/test/python/quantization/test_op_matmul.py b/onnxruntime/test/python/quantization/test_op_matmul.py index 344583aa7c624..91368bd643158 100644 --- a/onnxruntime/test/python/quantization/test_op_matmul.py +++ b/onnxruntime/test/python/quantization/test_op_matmul.py @@ -10,13 +10,39 @@ import numpy as np import onnx import packaging.version as pv +from numpy.testing import assert_almost_equal from onnx import TensorProto, helper from op_test_utils import TestDataFeeds, check_model_correctness, check_op_type_count, check_qtype_by_node_type +from onnxruntime.capi.onnxruntime_pybind11_state import Fail from onnxruntime.quantization import CalibrationMethod, QuantFormat, QuantType, quantize_dynamic, quantize_static +from onnxruntime.quantization.calibrate import entropy + + +def skip_if_new_opset_exception_raised(func): + def wrapper(*args, **kwargs): + try: + func(*args, **kwargs) + except Fail as e: + if "is under development and support for this is limited" in str(e): + raise unittest.SkipTest(f"Skipped {func} due to opset under development.") # noqa: B904 + raise + + return wrapper class TestOpMatMul(unittest.TestCase): + def test_entropy(self): + try: + from scipy.stats import entropy as scipy_entropy + except ImportError: + raise unittest.SkipTest("scipy not installed.") # noqa: B904 + pk = (np.arange(10) - 5).astype(np.float32) / 10 + qk = -(np.arange(10) - 5).astype(np.float32) / 10 + ent = scipy_entropy(pk, qk) + get = entropy(pk, qk) + assert_almost_equal(ent, get) + def input_feeds(self, n, name2shape, dtype): input_data_list = [] for _i in range(n): @@ -324,10 +350,11 @@ def test_quantize_matmul_u8u8(self): @unittest.skipIf( pv.Version(onnx.__version__) < pv.Version("1.15.1"), reason="Shape inference bug, see onnx PR #5709" ) + @skip_if_new_opset_exception_raised def test_quantize_matmul_u8u8_f16(self): - self.quantize_matmul_u8u8(onnx.TensorProto.FLOAT16, 19, 9) + self.quantize_matmul_u8u8(onnx.TensorProto.FLOAT16, 21, 9) - def quantize_matmul_s8s8(self, tt, opset, ir_version): + def quantize_matmul_s8s8(self, tt, opset, ir_version, calibrate_method=CalibrationMethod.MinMax): np.random.seed(1) model_fp_path = "matmul_fp.onnx" self.construct_model_matmul(model_fp_path, tensor_type=tt, opset=opset, ir_version=ir_version) @@ -341,6 +368,7 @@ def quantize_matmul_s8s8(self, tt, opset, ir_version): activation_type=QuantType.QInt8, weight_type=QuantType.QInt8, extra_options={"ActivationSymmetric": True}, + calibrate_method=calibrate_method, ) self.static_quant_test_qdq( model_fp_path, @@ -348,6 +376,7 @@ def quantize_matmul_s8s8(self, tt, opset, ir_version): activation_type=QuantType.QInt8, weight_type=QuantType.QInt8, extra_options={"ActivationSymmetric": True}, + calibrate_method=calibrate_method, ) # dynamic quantization doesn't support activation:int8 @@ -357,11 +386,42 @@ def quantize_matmul_s8s8(self, tt, opset, ir_version): def test_quantize_matmul_s8s8(self): self.quantize_matmul_s8s8(onnx.TensorProto.FLOAT, 18, 8) + def test_quantize_matmul_s8s8_entropy(self): + self.quantize_matmul_s8s8(onnx.TensorProto.FLOAT, 18, 8, calibrate_method=CalibrationMethod.Entropy) + + def test_quantize_matmul_s8s8_percentile(self): + self.quantize_matmul_s8s8(onnx.TensorProto.FLOAT, 18, 8, calibrate_method=CalibrationMethod.Percentile) + + def test_quantize_matmul_s8s8_distribution(self): + 
self.quantize_matmul_s8s8(onnx.TensorProto.FLOAT, 18, 8, calibrate_method=CalibrationMethod.Distribution) + @unittest.skipIf( pv.Version(onnx.__version__) < pv.Version("1.15.1"), reason="Shape inference bug, see onnx PR #5709" ) + @skip_if_new_opset_exception_raised def test_quantize_matmul_s8s8_f16(self): - self.quantize_matmul_s8s8(onnx.TensorProto.FLOAT16, 19, 9) + self.quantize_matmul_s8s8(onnx.TensorProto.FLOAT16, 21, 9) + + @unittest.skipIf( + pv.Version(onnx.__version__) < pv.Version("1.15.1"), reason="Shape inference bug, see onnx PR #5709" + ) + @skip_if_new_opset_exception_raised + def test_quantize_matmul_s8s8_f16_entropy(self): + self.quantize_matmul_s8s8(onnx.TensorProto.FLOAT16, 21, 9, calibrate_method=CalibrationMethod.Entropy) + + @unittest.skipIf( + pv.Version(onnx.__version__) < pv.Version("1.15.1"), reason="Shape inference bug, see onnx PR #5709" + ) + @skip_if_new_opset_exception_raised + def test_quantize_matmul_s8s8_f16_percentile(self): + self.quantize_matmul_s8s8(onnx.TensorProto.FLOAT16, 21, 9, calibrate_method=CalibrationMethod.Percentile) + + @unittest.skipIf( + pv.Version(onnx.__version__) < pv.Version("1.15.1"), reason="Shape inference bug, see onnx PR #5709" + ) + @skip_if_new_opset_exception_raised + def test_quantize_matmul_s8s8_f16_distribution(self): + self.quantize_matmul_s8s8(onnx.TensorProto.FLOAT16, 21, 9, calibrate_method=CalibrationMethod.Distribution) def quantize_matmul_e4m3fn_same(self, tt, opset, ir_version): np.random.seed(1) From bd9d8fb2a545a59d87a4c23308ec543ba6e4c41d Mon Sep 17 00:00:00 2001 From: Rachel Guo <35738743+YUNQIUGUO@users.noreply.github.com> Date: Wed, 17 Jan 2024 11:18:32 -0800 Subject: [PATCH 11/15] [ORT 1.17.0 release] Bump up version to 1.18.0 (#19170) ### Description Bump up version to 1.18.0 since the release branch has been cut. 
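A quick sanity check against a wheel built from this branch — a sketch only, since development builds may carry a suffix (e.g. a hypothetical "1.18.0.dev20240117") rather than exactly 1.18.0:

```python
import onnxruntime as ort

# Sketch: after this bump, a wheel built from main should report 1.18.x.
# Only the major.minor prefix is checked, to allow dev/rc suffixes.
assert ort.__version__.startswith("1.18"), f"unexpected version: {ort.__version__}"
```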
### Motivation and Context Co-authored-by: rachguo --- VERSION_NUMBER | 2 +- .../Training/NativeTrainingMethods.shared.cs | 4 ++-- docs/python/README.rst | 5 +++++ include/onnxruntime/core/session/onnxruntime_c_api.h | 2 +- js/common/lib/version.ts | 2 +- js/common/package-lock.json | 4 ++-- js/common/package.json | 2 +- js/node/lib/version.ts | 2 +- js/node/package-lock.json | 6 +++--- js/node/package.json | 2 +- js/react_native/lib/version.ts | 2 +- js/react_native/package.json | 2 +- js/react_native/yarn.lock | 2 +- js/web/lib/version.ts | 2 +- js/web/package-lock.json | 6 +++--- js/web/package.json | 2 +- onnxruntime/__init__.py | 2 +- onnxruntime/core/session/onnxruntime_c_api.cc | 8 ++++---- 18 files changed, 31 insertions(+), 26 deletions(-) diff --git a/VERSION_NUMBER b/VERSION_NUMBER index 092afa15df4df..84cc529467b05 100644 --- a/VERSION_NUMBER +++ b/VERSION_NUMBER @@ -1 +1 @@ -1.17.0 +1.18.0 diff --git a/csharp/src/Microsoft.ML.OnnxRuntime/Training/NativeTrainingMethods.shared.cs b/csharp/src/Microsoft.ML.OnnxRuntime/Training/NativeTrainingMethods.shared.cs index 68a399f8b9671..7fe16f4156ef2 100644 --- a/csharp/src/Microsoft.ML.OnnxRuntime/Training/NativeTrainingMethods.shared.cs +++ b/csharp/src/Microsoft.ML.OnnxRuntime/Training/NativeTrainingMethods.shared.cs @@ -65,10 +65,10 @@ static NativeTrainingMethods() DOrtGetApi OrtGetApi = (DOrtGetApi)Marshal.GetDelegateForFunctionPointer(NativeMethods.OrtGetApiBase().GetApi, typeof(DOrtGetApi)); // TODO: Make this save the pointer, and not copy the whole structure across - api_ = (OrtApi)OrtGetApi(17 /*ORT_API_VERSION*/); + api_ = (OrtApi)OrtGetApi(18 /*ORT_API_VERSION*/); OrtGetTrainingApi = (DOrtGetTrainingApi)Marshal.GetDelegateForFunctionPointer(api_.GetTrainingApi, typeof(DOrtGetTrainingApi)); - trainingApiPtr = OrtGetTrainingApi(17 /*ORT_API_VERSION*/); + trainingApiPtr = OrtGetTrainingApi(18 /*ORT_API_VERSION*/); if (trainingApiPtr != IntPtr.Zero) { trainingApi_ = (OrtTrainingApi)Marshal.PtrToStructure(trainingApiPtr, typeof(OrtTrainingApi)); diff --git a/docs/python/README.rst b/docs/python/README.rst index 32bb3729e01d0..bbc8571fe3f17 100644 --- a/docs/python/README.rst +++ b/docs/python/README.rst @@ -8,6 +8,11 @@ For more information on ONNX Runtime, please see `aka.ms/onnxruntime `_ or the `Github project `_. """ -__version__ = "1.17.0" +__version__ = "1.18.0" __author__ = "Microsoft" # we need to do device version validation (for example to check Cuda version for an onnxruntime-training package). diff --git a/onnxruntime/core/session/onnxruntime_c_api.cc b/onnxruntime/core/session/onnxruntime_c_api.cc index d77c188f832a7..91a7f0d930b51 100644 --- a/onnxruntime/core/session/onnxruntime_c_api.cc +++ b/onnxruntime/core/session/onnxruntime_c_api.cc @@ -2397,7 +2397,7 @@ Second example, if we wanted to add and remove some members, we'd do this: In GetApi we now make it return ort_api_3 for version 3. */ -static constexpr OrtApi ort_api_1_to_17 = { +static constexpr OrtApi ort_api_1_to_18 = { // NOTE: The ordering of these fields MUST not change after that version has shipped since existing binaries depend on this ordering. 
// Shipped as version 1 - DO NOT MODIFY (see above text for more information) @@ -2756,16 +2756,16 @@ static_assert(offsetof(OrtApi, KernelContext_GetResource) / sizeof(void*) == 265 static_assert(offsetof(OrtApi, SetUserLoggingFunction) / sizeof(void*) == 266, "Size of version 17 API cannot change"); // So that nobody forgets to finish an API version, this check will serve as a reminder: -static_assert(std::string_view(ORT_VERSION) == "1.17.0", +static_assert(std::string_view(ORT_VERSION) == "1.18.0", "ORT_Version change detected, please follow below steps to ensure OrtApi is updated properly"); // 1. Update the hardcoded version string in above static_assert to silence it -// 2. If there were any APIs added to ort_api_1_to_17 above: +// 2. If there were any APIs added to ort_api_1_to_18 above: // a. Add the 'End of version #' markers (pattern above should be obvious) // b. Add a static_assert in the directly above list of version sizes to ensure nobody adds any more functions to the just shipped API version ORT_API(const OrtApi*, OrtApis::GetApi, uint32_t version) { if (version >= 1 && version <= ORT_API_VERSION) - return &ort_api_1_to_17; + return &ort_api_1_to_18; fprintf(stderr, "The requested API version [%u] is not available, only API versions [1, %u] are supported in this build." From bc219ed553fc8d4b8fa3c7b4476810a63a864d8b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20M=C3=BCller?= <44298237+gedoensmax@users.noreply.github.com> Date: Wed, 17 Jan 2024 20:33:34 +0100 Subject: [PATCH 12/15] [TensorRT EP] Enable a minimal CUDA EP compilation without kernels (#19052) Addresses https://github.com/microsoft/onnxruntime/issues/18542. I followed the advice given by @RyanUnderhill [here](https://github.com/microsoft/onnxruntime/pull/18731#issuecomment-1848261925) and went with a minimal CUDA EP for now. --- cmake/CMakeLists.txt | 1 + cmake/onnxruntime_providers_cuda.cmake | 49 ++++++++++++++----- .../core/providers/cuda/cuda_context.h | 3 +- onnxruntime/core/providers/cuda/cuda_call.cc | 4 ++ .../core/providers/cuda/cuda_common.cc | 42 ++++++++-------- onnxruntime/core/providers/cuda/cuda_common.h | 6 ++- .../providers/cuda/cuda_execution_provider.cc | 14 +++++- onnxruntime/core/providers/cuda/cuda_pch.h | 7 +++ .../core/providers/cuda/cuda_stream_handle.cc | 4 ++ .../core/providers/cuda/cudnn_common.cc | 3 +- .../core/providers/cuda/cudnn_common.h | 3 +- 11 files changed, 97 insertions(+), 39 deletions(-) diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index bc96218dac79e..712d5d76108aa 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -79,6 +79,7 @@ option(onnxruntime_USE_CUDA "Build with CUDA support" OFF) cmake_dependent_option(onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS "Build with CUDA unit tests" OFF "onnxruntime_USE_CUDA;onnxruntime_BUILD_UNIT_TESTS;LINUX" OFF) option(onnxruntime_USE_CUDA_NHWC_OPS "Build CUDA with NHWC op support" OFF) +option(onnxruntime_CUDA_MINIMAL "Build CUDA without any operations apart from memcpy ops. Useful for a very minimal TRT build" OFF) option(onnxruntime_ENABLE_CUDA_LINE_NUMBER_INFO "When building with CUDA support, generate device code line number information." 
OFF) option(onnxruntime_USE_OPENVINO "Build with OpenVINO support" OFF) option(onnxruntime_USE_COREML "Build with CoreML support" OFF) diff --git a/cmake/onnxruntime_providers_cuda.cmake b/cmake/onnxruntime_providers_cuda.cmake index 84d1376f99d5e..9887d615c92d7 100644 --- a/cmake/onnxruntime_providers_cuda.cmake +++ b/cmake/onnxruntime_providers_cuda.cmake @@ -1,10 +1,25 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. - file(GLOB_RECURSE onnxruntime_providers_cuda_cc_srcs CONFIGURE_DEPENDS - "${ONNXRUNTIME_ROOT}/core/providers/cuda/*.h" - "${ONNXRUNTIME_ROOT}/core/providers/cuda/*.cc" - ) + + if (onnxruntime_CUDA_MINIMAL) + file(GLOB onnxruntime_providers_cuda_cc_srcs CONFIGURE_DEPENDS + "${ONNXRUNTIME_ROOT}/core/providers/cuda/*.h" + "${ONNXRUNTIME_ROOT}/core/providers/cuda/*.cc" + "${ONNXRUNTIME_ROOT}/core/providers/cuda/tunable/*.h" + "${ONNXRUNTIME_ROOT}/core/providers/cuda/tunable/*.cc" + ) + # Remove sources not needed for the minimal build + list(REMOVE_ITEM onnxruntime_providers_cuda_cc_srcs + "${ONNXRUNTIME_ROOT}/core/providers/cuda/integer_gemm.cc" + "${ONNXRUNTIME_ROOT}/core/providers/cuda/triton_kernel.h" + ) + else() + file(GLOB_RECURSE onnxruntime_providers_cuda_cc_srcs CONFIGURE_DEPENDS + "${ONNXRUNTIME_ROOT}/core/providers/cuda/*.h" + "${ONNXRUNTIME_ROOT}/core/providers/cuda/*.cc" + ) + endif() # Remove pch files list(REMOVE_ITEM onnxruntime_providers_cuda_cc_srcs "${ONNXRUNTIME_ROOT}/core/providers/cuda/cuda_pch.h" @@ -16,11 +31,16 @@ "${ONNXRUNTIME_ROOT}/core/providers/shared_library/*.h" "${ONNXRUNTIME_ROOT}/core/providers/shared_library/*.cc" ) - file(GLOB_RECURSE onnxruntime_providers_cuda_cu_srcs CONFIGURE_DEPENDS - "${ONNXRUNTIME_ROOT}/core/providers/cuda/*.cu" - "${ONNXRUNTIME_ROOT}/core/providers/cuda/*.cuh" - ) + + if (onnxruntime_CUDA_MINIMAL) + set(onnxruntime_providers_cuda_cu_srcs "") + else() + file(GLOB_RECURSE onnxruntime_providers_cuda_cu_srcs CONFIGURE_DEPENDS + "${ONNXRUNTIME_ROOT}/core/providers/cuda/*.cu" + "${ONNXRUNTIME_ROOT}/core/providers/cuda/*.cuh" + ) + endif() source_group(TREE ${ONNXRUNTIME_ROOT}/core FILES ${onnxruntime_providers_cuda_cc_srcs} ${onnxruntime_providers_cuda_shared_srcs} ${onnxruntime_providers_cuda_cu_srcs}) set(onnxruntime_providers_cuda_src ${onnxruntime_providers_cuda_cc_srcs} ${onnxruntime_providers_cuda_shared_srcs} ${onnxruntime_providers_cuda_cu_srcs}) @@ -156,10 +176,15 @@ endif() add_dependencies(${target} onnxruntime_providers_shared ${onnxruntime_EXTERNAL_DEPENDENCIES}) - target_link_libraries(${target} PRIVATE cublasLt cublas cudnn curand cufft ${ABSEIL_LIBS} ${ONNXRUNTIME_PROVIDERS_SHARED} Boost::mp11 safeint_interface) - if(onnxruntime_CUDNN_HOME) - target_include_directories(${target} PRIVATE ${onnxruntime_CUDNN_HOME}/include) - target_link_directories(${target} PRIVATE ${onnxruntime_CUDNN_HOME}/lib) + if(onnxruntime_CUDA_MINIMAL) + target_compile_definitions(${target} PRIVATE USE_CUDA_MINIMAL) + target_link_libraries(${target} PRIVATE ${ABSEIL_LIBS} ${ONNXRUNTIME_PROVIDERS_SHARED} Boost::mp11 safeint_interface) + else() + target_link_libraries(${target} PRIVATE cublasLt cublas cudnn curand cufft ${ABSEIL_LIBS} ${ONNXRUNTIME_PROVIDERS_SHARED} Boost::mp11 safeint_interface) + if(onnxruntime_CUDNN_HOME) + target_include_directories(${target} PRIVATE ${onnxruntime_CUDNN_HOME}/include) + target_link_directories(${target} PRIVATE ${onnxruntime_CUDNN_HOME}/lib) + endif() endif() if (onnxruntime_USE_TRITON_KERNEL) diff --git a/include/onnxruntime/core/providers/cuda/cuda_context.h 
b/include/onnxruntime/core/providers/cuda/cuda_context.h index 9416fad5f1448..1370f5c4c5e10 100644 --- a/include/onnxruntime/core/providers/cuda/cuda_context.h +++ b/include/onnxruntime/core/providers/cuda/cuda_context.h @@ -16,9 +16,10 @@ #include "core/providers/custom_op_context.h" #include #include +#ifndef USE_CUDA_MINIMAL #include #include - +#endif namespace Ort { namespace Custom { diff --git a/onnxruntime/core/providers/cuda/cuda_call.cc b/onnxruntime/core/providers/cuda/cuda_call.cc index 4f223041e04e3..f60684795a4bc 100644 --- a/onnxruntime/core/providers/cuda/cuda_call.cc +++ b/onnxruntime/core/providers/cuda/cuda_call.cc @@ -30,6 +30,7 @@ const char* CudaErrString(cudaError_t x) { return cudaGetErrorString(x); } +#ifndef USE_CUDA_MINIMAL template <> const char* CudaErrString(cublasStatus_t e) { cudaDeviceSynchronize(); @@ -76,6 +77,7 @@ const char* CudaErrString(cufftResult e) { return "Unknown cufft error status"; } } +#endif #ifdef ORT_USE_NCCL template <> @@ -132,6 +134,7 @@ std::conditional_t CudaCall( template Status CudaCall(cudaError retCode, const char* exprString, const char* libName, cudaError successCode, const char* msg, const char* file, const int line); template void CudaCall(cudaError retCode, const char* exprString, const char* libName, cudaError successCode, const char* msg, const char* file, const int line); +#ifndef USE_CUDA_MINIMAL template Status CudaCall(cublasStatus_t retCode, const char* exprString, const char* libName, cublasStatus_t successCode, const char* msg, const char* file, const int line); template void CudaCall(cublasStatus_t retCode, const char* exprString, const char* libName, cublasStatus_t successCode, const char* msg, const char* file, const int line); template Status CudaCall(cudnnStatus_t retCode, const char* exprString, const char* libName, cudnnStatus_t successCode, const char* msg, const char* file, const int line); @@ -140,6 +143,7 @@ template Status CudaCall(curandStatus_t retCode, const ch template void CudaCall(curandStatus_t retCode, const char* exprString, const char* libName, curandStatus_t successCode, const char* msg, const char* file, const int line); template Status CudaCall(cufftResult retCode, const char* exprString, const char* libName, cufftResult successCode, const char* msg, const char* file, const int line); template void CudaCall(cufftResult retCode, const char* exprString, const char* libName, cufftResult successCode, const char* msg, const char* file, const int line); +#endif #ifdef ORT_USE_NCCL template Status CudaCall(ncclResult_t retCode, const char* exprString, const char* libName, ncclResult_t successCode, const char* msg, const char* file, const int line); diff --git a/onnxruntime/core/providers/cuda/cuda_common.cc b/onnxruntime/core/providers/cuda/cuda_common.cc index 33f2938940e4d..65083f89f7f77 100644 --- a/onnxruntime/core/providers/cuda/cuda_common.cc +++ b/onnxruntime/core/providers/cuda/cuda_common.cc @@ -14,6 +14,27 @@ namespace cuda { // 0x04 - pedantic constexpr const char* kCudaGemmOptions = "ORT_CUDA_GEMM_OPTIONS"; +const char* CudaDataTypeToString(cudaDataType_t dt) { + switch (dt) { + case CUDA_R_16F: + return "CUDA_R_16F"; + case CUDA_R_16BF: + return "CUDA_R_16BF"; + case CUDA_R_32F: + return "CUDA_R_32F"; +#if !defined(DISABLE_FLOAT8_TYPES) + // Note: CUDA_R_8F_E4M3 is defined with CUDA>=11.8 + case CUDA_R_8F_E4M3: + return "CUDA_R_8F_E4M3"; + case CUDA_R_8F_E5M2: + return "CUDA_R_8F_E5M2"; +#endif + default: + return ""; + } +} + +#ifndef USE_CUDA_MINIMAL // Initialize the singleton 
instance HalfGemmOptions HalfGemmOptions::instance; @@ -54,26 +75,6 @@ const char* cublasGetErrorEnum(cublasStatus_t error) { } } -const char* CudaDataTypeToString(cudaDataType_t dt) { - switch (dt) { - case CUDA_R_16F: - return "CUDA_R_16F"; - case CUDA_R_16BF: - return "CUDA_R_16BF"; - case CUDA_R_32F: - return "CUDA_R_32F"; -#if !defined(DISABLE_FLOAT8_TYPES) - // Note: CUDA_R_8F_E4M3 is defined with CUDA>=11.8 - case CUDA_R_8F_E4M3: - return "CUDA_R_8F_E4M3"; - case CUDA_R_8F_E5M2: - return "CUDA_R_8F_E5M2"; -#endif - default: - return ""; - } -} - const char* CublasComputeTypeToString(cublasComputeType_t ct) { switch (ct) { case CUBLAS_COMPUTE_16F: @@ -92,6 +93,7 @@ const char* CublasComputeTypeToString(cublasComputeType_t ct) { return ""; } } +#endif // It must exist somewhere already. cudaDataType_t ToCudaDataType(int32_t element_type) { diff --git a/onnxruntime/core/providers/cuda/cuda_common.h b/onnxruntime/core/providers/cuda/cuda_common.h index 707099bac3ce0..e9941ce743bc3 100644 --- a/onnxruntime/core/providers/cuda/cuda_common.h +++ b/onnxruntime/core/providers/cuda/cuda_common.h @@ -22,13 +22,14 @@ namespace onnxruntime { namespace cuda { #define CUDA_RETURN_IF_ERROR(expr) ORT_RETURN_IF_ERROR(CUDA_CALL(expr)) +#ifndef USE_CUDA_MINIMAL #define CUBLAS_RETURN_IF_ERROR(expr) ORT_RETURN_IF_ERROR(CUBLAS_CALL(expr)) #define CUSPARSE_RETURN_IF_ERROR(expr) ORT_RETURN_IF_ERROR(CUSPARSE_CALL(expr)) #define CURAND_RETURN_IF_ERROR(expr) ORT_RETURN_IF_ERROR(CURAND_CALL(expr)) #define CUDNN_RETURN_IF_ERROR(expr) ORT_RETURN_IF_ERROR(CUDNN_CALL(expr)) #define CUDNN2_RETURN_IF_ERROR(expr, m) ORT_RETURN_IF_ERROR(CUDNN_CALL2(expr, m)) #define CUFFT_RETURN_IF_ERROR(expr) ORT_RETURN_IF_ERROR(CUFFT_CALL(expr)) - +#endif // Type mapping for MLFloat16 to half template class ToCudaType { @@ -93,7 +94,7 @@ inline bool CalculateFdmStrides(gsl::span p, const std::vector KernelCreateInfo BuildKernelCreateInfo() { @@ -1326,6 +1332,7 @@ static Status RegisterCudaKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, // default entry to avoid the list become empty after ops-reducing BuildKernelCreateInfo, BuildKernelCreateInfo, +#ifndef USE_CUDA_MINIMAL BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, @@ -2201,6 +2208,7 @@ static Status RegisterCudaKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, +#endif }; for (auto& function_table_entry : function_table) { @@ -2210,6 +2218,7 @@ static Status RegisterCudaKernels(KernelRegistry& kernel_registry) { } } +#ifndef USE_CUDA_MINIMAL #ifndef DISABLE_CONTRIB_OPS ORT_RETURN_IF_ERROR(::onnxruntime::contrib::cuda::RegisterCudaContribKernels(kernel_registry)); #endif @@ -2220,6 +2229,7 @@ static Status RegisterCudaKernels(KernelRegistry& kernel_registry) { #ifdef ENABLE_TRAINING_OPS ORT_RETURN_IF_ERROR(::onnxruntime::cuda::RegisterCudaTrainingKernels(kernel_registry)); +#endif #endif return Status::OK(); diff --git a/onnxruntime/core/providers/cuda/cuda_pch.h b/onnxruntime/core/providers/cuda/cuda_pch.h index f48554e8f1286..dfe50fe0a8832 100644 --- a/onnxruntime/core/providers/cuda/cuda_pch.h +++ b/onnxruntime/core/providers/cuda/cuda_pch.h @@ -10,12 +10,19 @@ #include #include +#include +#ifndef USE_CUDA_MINIMAL #include #include #include #include #include #include +#else +typedef void* cudnnHandle_t; +typedef void* cublasHandle_t; +typedef void* cublasLtHandle_t; +#endif #ifdef ORT_USE_NCCL #include diff --git a/onnxruntime/core/providers/cuda/cuda_stream_handle.cc 
b/onnxruntime/core/providers/cuda/cuda_stream_handle.cc index 7c866395ecf6e..0a256394b7d99 100644 --- a/onnxruntime/core/providers/cuda/cuda_stream_handle.cc +++ b/onnxruntime/core/providers/cuda/cuda_stream_handle.cc @@ -69,6 +69,7 @@ CudaStream::CudaStream(cudaStream_t stream, release_cpu_buffer_on_cuda_stream_(release_cpu_buffer_on_cuda_stream), deferred_cpu_allocator_(*this), ep_info_(ep_info) { +#ifndef USE_CUDA_MINIMAL if (own_flag) { CUBLAS_CALL_THROW(cublasCreate(&cublas_handle_)); CUBLAS_CALL_THROW(cublasSetStream(cublas_handle_, stream)); @@ -80,10 +81,12 @@ CudaStream::CudaStream(cudaStream_t stream, cudnn_handle_ = external_cudnn_handle; CUDNN_CALL_THROW(cudnnSetStream(cudnn_handle_, stream)); } +#endif } CudaStream::~CudaStream() { ORT_IGNORE_RETURN_VALUE(CleanUpOnRunEnd()); +#ifndef USE_CUDA_MINIMAL if (own_stream_) { cublasDestroy(cublas_handle_); cudnnDestroy(cudnn_handle_); @@ -91,6 +94,7 @@ CudaStream::~CudaStream() { if (handle) cudaStreamDestroy(static_cast<cudaStream_t>(handle)); } +#endif } std::unique_ptr<synchronize::Notification> CudaStream::CreateNotification(size_t /*num_consumers*/) { diff --git a/onnxruntime/core/providers/cuda/cudnn_common.cc b/onnxruntime/core/providers/cuda/cudnn_common.cc index 4df59a98b12e5..c850f7b583bfc 100644 --- a/onnxruntime/core/providers/cuda/cudnn_common.cc +++ b/onnxruntime/core/providers/cuda/cudnn_common.cc @@ -9,7 +9,7 @@ #include "core/common/gsl.h" #include "shared_inc/cuda_call.h" #include "core/providers/cpu/tensor/utils.h" - +#ifndef USE_CUDA_MINIMAL namespace onnxruntime { namespace cuda { @@ -222,3 +222,4 @@ const Float8E5M2 Consts<Float8E5M2>::One = Float8E5M2(1.0f, true); } // namespace cuda } // namespace onnxruntime +#endif diff --git a/onnxruntime/core/providers/cuda/cudnn_common.h b/onnxruntime/core/providers/cuda/cudnn_common.h index 8a94a334ee688..fdd14dedad47e 100644 --- a/onnxruntime/core/providers/cuda/cudnn_common.h +++ b/onnxruntime/core/providers/cuda/cudnn_common.h @@ -7,7 +7,7 @@ #include #include "core/providers/cuda/cuda_common.h" - +#ifndef USE_CUDA_MINIMAL namespace onnxruntime { namespace cuda { @@ -260,3 +260,4 @@ SetPoolingNdDescriptorHelper(cudnnPoolingDescriptor_t poolingDesc, } // namespace cuda } // namespace onnxruntime +#endif From 146ebaf91e85185a0ac18c82bc69eba685ab9727 Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Wed, 17 Jan 2024 15:03:43 -0800 Subject: [PATCH 13/15] [js/web] allow proxy to load model with 1GB <= size < 2GB (#19178) ### Description Allow proxy to load models with 1GB <= size < 2GB. Resolves #19157. 
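For scale, the 64 KiB page arithmetic behind the WebAssembly.Memory fallback works out as follows — a short sketch using a hypothetical 1.5 GB model size:

```python
import math

# Illustrative numbers for the loadFile() fallback below: when
# `new ArrayBuffer(fileSize)` raises a RangeError, the buffer is taken from
# a WebAssembly.Memory instead, which is sized in 64 KiB (65536-byte) pages.
file_size = 1_500_000_000             # hypothetical 1.5 GB model
pages = math.ceil(file_size / 65536)  # same computation as in the patch
print(pages)                          # 22889
assert pages * 65536 >= file_size     # the memory is large enough for the file
```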
--- js/web/lib/wasm/wasm-utils-load-file.ts | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/js/web/lib/wasm/wasm-utils-load-file.ts b/js/web/lib/wasm/wasm-utils-load-file.ts index abe480a43c790..c6cdba2320bde 100644 --- a/js/web/lib/wasm/wasm-utils-load-file.ts +++ b/js/web/lib/wasm/wasm-utils-load-file.ts @@ -47,9 +47,19 @@ export const loadFile = async(file: string|Blob|ArrayBufferLike|Uint8Array): Pro } const reader = response.body.getReader(); - // use WebAssembly Memory to allocate larger ArrayBuffer - const pages = Math.ceil(fileSize / 65536); - const buffer = new WebAssembly.Memory({initial: pages, maximum: pages}).buffer; + let buffer; + try { + // try to create ArrayBuffer directly + buffer = new ArrayBuffer(fileSize); + } catch (e) { + if (e instanceof RangeError) { + // use WebAssembly Memory to allocate larger ArrayBuffer + const pages = Math.ceil(fileSize / 65536); + buffer = new WebAssembly.Memory({initial: pages, maximum: pages}).buffer; + } else { + throw e; + } + } let offset = 0; // eslint-disable-next-line no-constant-condition From f87e69801f200a34ddb312f1d39e7296f19b660b Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Wed, 17 Jan 2024 15:04:22 -0800 Subject: [PATCH 14/15] [js/web] show warning when numThreads is set but threads is not supported (#19179) ### Description Show a warning when numThreads is set but threads are not supported. Resolves #19148 and #18933. For web: when crossOriginIsolated is false. For Node.js: always disabled. --- js/web/lib/backend-wasm.ts | 6 ++++++ js/web/lib/wasm/wasm-factory.ts | 33 +++++++++++++++++++++++++++------ 2 files changed, 33 insertions(+), 6 deletions(-) diff --git a/js/web/lib/backend-wasm.ts b/js/web/lib/backend-wasm.ts index d9f63fec9c492..31ecffb07e40c 100644 --- a/js/web/lib/backend-wasm.ts +++ b/js/web/lib/backend-wasm.ts @@ -31,6 +31,12 @@ export const initializeFlags = (): void => { } if (typeof env.wasm.numThreads !== 'number' || !Number.isInteger(env.wasm.numThreads) || env.wasm.numThreads <= 0) { + // Web: when crossOriginIsolated is false, SharedArrayBuffer is not available so WebAssembly threads will not work. + // Node.js: onnxruntime-web does not support multi-threads in Node.js. + if ((typeof self !== 'undefined' && !self.crossOriginIsolated) || + (typeof process !== 'undefined' && process.versions && process.versions.node)) { + env.wasm.numThreads = 1; + } const numCpuLogicalCores = typeof navigator === 'undefined' ? cpus().length : navigator.hardwareConcurrency; env.wasm.numThreads = Math.min(4, Math.ceil((numCpuLogicalCores || 1) / 2)); } diff --git a/js/web/lib/wasm/wasm-factory.ts b/js/web/lib/wasm/wasm-factory.ts index 81508a253ce8b..9b9334c93b78c 100644 --- a/js/web/lib/wasm/wasm-factory.ts +++ b/js/web/lib/wasm/wasm-factory.ts @@ -28,13 +28,34 @@ let initialized = false; let initializing = false; let aborted = false; -const isMultiThreadSupported = (): boolean => { - try { - // If 'SharedArrayBuffer' is not available, WebAssembly threads will not work. - if (typeof SharedArrayBuffer === 'undefined') { - return false; +const isMultiThreadSupported = (numThreads: number): boolean => { + // numThreads is set to 1 (single thread), so multi-threading is not needed. + if (numThreads === 1) { + return false; + } + + // If 'SharedArrayBuffer' is not available, WebAssembly threads will not work. 
+ if (typeof SharedArrayBuffer === 'undefined') { + if (typeof self !== 'undefined' && !self.crossOriginIsolated) { + // eslint-disable-next-line no-console + console.warn( + 'env.wasm.numThreads is set to ' + numThreads + + ', but this will not work unless you enable crossOriginIsolated mode. ' + + 'See https://web.dev/cross-origin-isolation-guide/ for more info.'); } + return false; + } + + // onnxruntime-web does not support multi-threads in Node.js. + if (typeof process !== 'undefined' && process.versions && process.versions.node) { + // eslint-disable-next-line no-console + console.warn( + 'env.wasm.numThreads is set to ' + numThreads + + ', however, currently onnxruntime-web does not support multi-threads in Node.js. ' + + 'Please consider using onnxruntime-node for performance critical scenarios.'); + } + try { // Test for transferability of SABs (for browsers. needed for Firefox) // https://groups.google.com/forum/#!msg/mozilla.dev.platform/IHkBZlHETpA/dwsMNchWEQAJ if (typeof MessageChannel !== 'undefined') { @@ -106,7 +127,7 @@ export const initializeWebAssembly = async(flags: Env.WebAssemblyFlags): Promise const numThreads = flags.numThreads!; const simd = flags.simd!; - const useThreads = numThreads > 1 && isMultiThreadSupported(); + const useThreads = isMultiThreadSupported(numThreads); const useSimd = simd && isSimdSupported(); const wasmPaths = flags.wasmPaths; From 9da3e36138dd24377fbb0b4022d891b3baf07b84 Mon Sep 17 00:00:00 2001 From: Jian Chen Date: Wed, 17 Jan 2024 20:20:42 -0500 Subject: [PATCH 15/15] Fix buildJava from Zip-Nuget-Java-Nodejs Packaging Pipeline (#19187) ### Description ### Motivation and Context --- .../c-api-noopenmp-packaging-pipelines.yml | 2 ++ .../stages/nuget-linux-cuda-packaging-stage.yml | 10 ++++++++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml index 3803333bd880a..aa1a75bfcda45 100644 --- a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml +++ b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml @@ -204,6 +204,8 @@ stages: CudaVersion: ${{ parameters.CudaVersion }} docker_base_image: ${{ variables.docker_base_image }} linux_trt_version: ${{ variables.linux_trt_version }} + buildJava: true + buildNodejs: true #CUDA without tensorrt - template: templates/win-ci.yml diff --git a/tools/ci_build/github/azure-pipelines/stages/nuget-linux-cuda-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/nuget-linux-cuda-packaging-stage.yml index dbbc9ef27e513..db9bcacbf0754 100644 --- a/tools/ci_build/github/azure-pipelines/stages/nuget-linux-cuda-packaging-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/nuget-linux-cuda-packaging-stage.yml @@ -6,6 +6,12 @@ parameters: type: string - name: linux_trt_version type: string +- name: buildJava + type: boolean + default: false +- name: buildNodejs + type: boolean + default: false stages: # Linux CUDA without TensorRT Packaging @@ -66,9 +72,9 @@ stages: parameters: artifactName: 'onnxruntime-linux-x64-tensorrt-$(OnnxRuntimeVersion)' artifactNameNoVersionString: 'onnxruntime-linux-x64-tensorrt' - buildJava: false + buildJava: ${{ parameters.buildJava }} buildJavaOption: '--build_java' - buildNodejs: false + buildNodejs: ${{ parameters.buildNodejs }} buildNodejsOption: '--build_nodejs' CudaVersion: ${{ parameters.CudaVersion }} # Linux CUDA Combined Testing and 
Publishing