Skip to content

Commit

Permalink
fix: retire gpu dedicated aks image (#5548)
Browse files Browse the repository at this point in the history
Co-authored-by: Zachary Bailey <[email protected]>
Co-authored-by: Zachary Bailey <[email protected]>
  • Loading branch information
3 people committed Jan 11, 2025
1 parent fc3fe29 commit 7f43bec
Show file tree
Hide file tree
Showing 6 changed files with 8 additions and 124 deletions.
62 changes: 0 additions & 62 deletions .pipelines/.vsts-vhd-builder-release.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,14 +24,6 @@ parameters:
displayName: Build 1804 Gen2 containerd
type: boolean
default: true
# Toggle for the build1804gpucontainerd job (Ubuntu 18.04 GPU + containerd image).
# The job's condition compares this parameter to true; it is enabled by default.
- name: build1804gpucontainerd
displayName: Build 1804 GPU+containerd
type: boolean
default: true
# Toggle for the build1804gen2gpucontainerd job (Ubuntu 18.04 Gen2 GPU + containerd image).
# The job's condition compares this parameter to true; it is enabled by default.
- name: build1804gen2gpucontainerd
displayName: Build 1804 Gen2 GPU+containerd
type: boolean
default: true
- name: buildMarinerV2gen1
displayName: Build MarinerV2 Gen1
type: boolean
Expand Down Expand Up @@ -242,60 +234,6 @@ stages:
useOverrides: ${{ parameters.useOverrides }}
overrideBranch: ${{ parameters.overrideBranch }}
artifactName: 1804-gen2-containerd
# Builds the Ubuntu 18.04 Hyper-V Gen1 GPU + containerd image.
# Runs only when the build1804gpucontainerd parameter is true, declares no job
# dependencies (empty dependsOn), and is limited to 180 minutes.
- job: build1804gpucontainerd
condition: eq('${{ parameters.build1804gpucontainerd }}', true)
dependsOn: [ ]
timeoutInMinutes: 180
steps:
# Each echo emits a "##vso[task.setvariable ...]" logging command that sets one
# build variable. FEATURE_FLAGS is "fullgpudaemon", which contains both the
# "fullgpu" and "gpudaemon" substrings. DRY_RUN comes from the dryrun parameter.
- bash: |
echo '##vso[task.setvariable variable=DRY_RUN]${{parameters.dryrun}}'
echo '##vso[task.setvariable variable=OS_SKU]Ubuntu'
echo '##vso[task.setvariable variable=OS_VERSION]18.04'
echo '##vso[task.setvariable variable=IMG_PUBLISHER]Canonical'
echo '##vso[task.setvariable variable=IMG_OFFER]UbuntuServer'
echo '##vso[task.setvariable variable=IMG_SKU]18.04-LTS'
echo '##vso[task.setvariable variable=IMG_VERSION]latest'
echo '##vso[task.setvariable variable=HYPERV_GENERATION]V1'
echo '##vso[task.setvariable variable=AZURE_VM_SIZE]Standard_NC4as_T4_v3'
echo '##vso[task.setvariable variable=FEATURE_FLAGS]fullgpudaemon'
echo '##vso[task.setvariable variable=CONTAINER_RUNTIME]containerd'
echo '##vso[task.setvariable variable=ARCHITECTURE]X86_64'
echo '##vso[task.setvariable variable=ENABLE_FIPS]False'
echo '##vso[task.setvariable variable=ENABLE_TRUSTED_LAUNCH]False'
echo '##vso[task.setvariable variable=SGX_INSTALL]False'
displayName: Setup Build Variables
# Hands off to the shared release template, forwarding the override settings
# and naming the artifact 1804-gpu-containerd.
- template: ./templates/.builder-release-template.yaml
parameters:
useOverrides: ${{ parameters.useOverrides }}
overrideBranch: ${{ parameters.overrideBranch }}
artifactName: 1804-gpu-containerd
# Builds the Ubuntu 18.04 Hyper-V Gen2 GPU + containerd image.
# Runs only when the build1804gen2gpucontainerd parameter is true, declares no
# job dependencies (empty dependsOn), and is limited to 180 minutes.
- job: build1804gen2gpucontainerd
condition: eq('${{ parameters.build1804gen2gpucontainerd }}', true)
dependsOn: [ ]
timeoutInMinutes: 180
steps:
# Each echo emits a "##vso[task.setvariable ...]" logging command that sets one
# build variable. Compared with the Gen1 GPU job, the values that differ are
# IMG_SKU (18_04-LTS-GEN2), HYPERV_GENERATION (V2) and SGX_INSTALL (True).
- bash: |
echo '##vso[task.setvariable variable=DRY_RUN]${{parameters.dryrun}}'
echo '##vso[task.setvariable variable=OS_SKU]Ubuntu'
echo '##vso[task.setvariable variable=OS_VERSION]18.04'
echo '##vso[task.setvariable variable=IMG_PUBLISHER]Canonical'
echo '##vso[task.setvariable variable=IMG_OFFER]UbuntuServer'
echo '##vso[task.setvariable variable=IMG_SKU]18_04-LTS-GEN2'
echo '##vso[task.setvariable variable=IMG_VERSION]latest'
echo '##vso[task.setvariable variable=HYPERV_GENERATION]V2'
echo '##vso[task.setvariable variable=AZURE_VM_SIZE]Standard_NC4as_T4_v3'
echo '##vso[task.setvariable variable=FEATURE_FLAGS]fullgpudaemon'
echo '##vso[task.setvariable variable=CONTAINER_RUNTIME]containerd'
echo '##vso[task.setvariable variable=ARCHITECTURE]X86_64'
echo '##vso[task.setvariable variable=ENABLE_FIPS]False'
echo '##vso[task.setvariable variable=ENABLE_TRUSTED_LAUNCH]False'
echo '##vso[task.setvariable variable=SGX_INSTALL]True'
displayName: Setup Build Variables
# Hands off to the shared release template, forwarding the override settings
# and naming the artifact 1804-gen2-gpu-containerd.
- template: ./templates/.builder-release-template.yaml
parameters:
useOverrides: ${{ parameters.useOverrides }}
overrideBranch: ${{ parameters.overrideBranch }}
artifactName: 1804-gen2-gpu-containerd
- job: buildMarinerV2gen1
condition: eq('${{ parameters.buildMarinerV2gen1 }}', true)
dependsOn: [ ]
Expand Down
23 changes: 0 additions & 23 deletions .pipelines/.vsts-vhd-builder.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -51,29 +51,6 @@ stages:
- template: ./templates/.builder-release-template.yaml
parameters:
artifactName: 2204-gen2-containerd
# Builds the Ubuntu 18.04 Hyper-V Gen2 GPU + containerd image in this pipeline.
# Unlike the release pipeline's job of the same name, it has no condition or
# dependsOn, hard-codes DRY_RUN to False, and passes only artifactName to the
# shared template. Limited to 180 minutes.
- job: build1804gen2gpucontainerd
timeoutInMinutes: 180
steps:
# Each echo emits a "##vso[task.setvariable ...]" logging command that sets one
# build variable.
# NOTE(review): ENABLE_FIPS is written as lowercase "false" here while the other
# flags use "True"/"False". The release template lowercases ENABLE_FIPS before
# comparing it, so casing is harmless there; confirm for any other consumers.
- bash: |
echo '##vso[task.setvariable variable=DRY_RUN]False'
echo '##vso[task.setvariable variable=OS_SKU]Ubuntu'
echo '##vso[task.setvariable variable=OS_VERSION]18.04'
echo '##vso[task.setvariable variable=IMG_PUBLISHER]Canonical'
echo '##vso[task.setvariable variable=IMG_OFFER]UbuntuServer'
echo '##vso[task.setvariable variable=IMG_SKU]18_04-LTS-GEN2'
echo '##vso[task.setvariable variable=IMG_VERSION]latest'
echo '##vso[task.setvariable variable=HYPERV_GENERATION]V2'
echo '##vso[task.setvariable variable=AZURE_VM_SIZE]Standard_NC4as_T4_v3'
echo '##vso[task.setvariable variable=FEATURE_FLAGS]fullgpudaemon'
echo '##vso[task.setvariable variable=CONTAINER_RUNTIME]containerd'
echo '##vso[task.setvariable variable=ARCHITECTURE]X86_64'
echo '##vso[task.setvariable variable=ENABLE_FIPS]false'
echo '##vso[task.setvariable variable=ENABLE_TRUSTED_LAUNCH]False'
echo '##vso[task.setvariable variable=SGX_INSTALL]True'
displayName: Setup Build Variables
# Hands off to the shared release template, naming the artifact
# 1804-gen2-gpu-containerd.
- template: ./templates/.builder-release-template.yaml
parameters:
artifactName: 1804-gen2-gpu-containerd
- job: buildmarinerv2gen2
timeoutInMinutes: 180
steps:
Expand Down
1 change: 0 additions & 1 deletion .pipelines/templates/.builder-release-template.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,6 @@ steps:
if [[ "${HYPERV_GENERATION,,}" == "v2" ]]; then SKU_NAME="${SKU_NAME}gen2"; fi && \
if [[ "${ARCHITECTURE,,}" == "arm64" ]]; then SKU_NAME="${SKU_NAME}arm64"; fi && \
if [[ "${ENABLE_FIPS,,}" == "true" ]]; then SKU_NAME="${SKU_NAME}fips"; fi && \
if [[ "$(FEATURE_FLAGS)" == *"fullgpu"* ]]; then SKU_NAME="${SKU_NAME}gpu"; fi && \
if [[ "${IMG_SKU}" == "20_04-lts-cvm" ]]; then SKU_NAME="${SKU_NAME}CVM"; fi && \
if [[ "${IMG_SKU}" == *"minimal"* ]]; then SKU_NAME="${SKU_NAME}minimal"; fi && \
if [[ "${ENABLE_TRUSTED_LAUNCH}" == "True" ]]; then SKU_NAME="${SKU_NAME}TL"; fi && \
Expand Down
10 changes: 8 additions & 2 deletions pkg/agent/datamodel/sig_config.go
Original file line number Diff line number Diff line change
Expand Up @@ -374,6 +374,12 @@ const (
// of support and image builds have stopped.
FrozenCBLMarinerV1SIGImageVersionForDeprecation string = "202308.28.0"

// DO NOT MODIFY: 1804GPUContainerd Gen1 & Gen2 pinned to the last image build as
// GPU Dedicated preview image is being deprecated and image builds have stopped.
// This constant is the Version of the "1804gpucontainerd" SIG image definition.
Frozen1804GPUContainerdSIGImageVersionForDeprecation string = "202501.05.0"

// DO NOT MODIFY: Gen2 counterpart of the frozen GPU image version, pinned for the
// same deprecation. This constant is the Version of the "1804gen2gpucontainerd"
// SIG image definition.
Frozen1804Gen2GPUContainerdSIGImageVersionForDeprecation string = "202501.05.0"

// We do not use AKS Windows image versions in AgentBaker. These fake values are only used for unit tests.
Windows2019SIGImageVersion string = "17763.2019.221114"
Windows2022SIGImageVersion string = "20348.2022.221114"
Expand Down Expand Up @@ -470,14 +476,14 @@ var (
ResourceGroup: AKSUbuntuResourceGroup,
Gallery: AKSUbuntuGalleryName,
Definition: "1804gpucontainerd",
Version: LinuxSIGImageVersion,
Version: Frozen1804GPUContainerdSIGImageVersionForDeprecation,
}

SIGUbuntuGPUContainerd1804Gen2ImageConfigTemplate = SigImageConfigTemplate{
ResourceGroup: AKSUbuntuResourceGroup,
Gallery: AKSUbuntuGalleryName,
Definition: "1804gen2gpucontainerd",
Version: LinuxSIGImageVersion,
Version: Frozen1804Gen2GPUContainerdSIGImageVersionForDeprecation,
}

SIGUbuntuFipsContainerd1804ImageConfigTemplate = SigImageConfigTemplate{
Expand Down
29 changes: 0 additions & 29 deletions vhdbuilder/packer/install-dependencies.sh
Original file line number Diff line number Diff line change
Expand Up @@ -435,23 +435,12 @@ if [[ $OS == $UBUNTU_OS_NAME && $(isARM64) != 1 ]]; then # No ARM64 SKU with GP

ctr -n k8s.io image pull "$NVIDIA_DRIVER_IMAGE:$NVIDIA_DRIVER_IMAGE_TAG"

# Install the GPU driver during the image build only when FEATURE_FLAGS
# contains the "fullgpu" substring.
if [[ "$FEATURE_FLAGS" == *"fullgpu"* ]]; then
  bash -c "$CTR_GPU_INSTALL_CMD $NVIDIA_DRIVER_IMAGE:$NVIDIA_DRIVER_IMAGE_TAG gpuinstall /entrypoint.sh install"
  ret=$?
  # Fail the build with the installer's own exit status.
  if (( ret != 0 )); then
    echo "Failed to install GPU driver, exiting..."
    exit $ret
  fi
fi

cat << EOF >> ${VHD_LOGS_FILEPATH}
- nvidia-driver=${NVIDIA_DRIVER_IMAGE_TAG}
EOF

fi


ls -ltr /opt/gpu/* >> ${VHD_LOGS_FILEPATH}

installBpftrace
Expand Down Expand Up @@ -541,25 +530,7 @@ NVIDIA_DEVICE_PLUGIN_VERSION="v0.14.5"

DEVICE_PLUGIN_CONTAINER_IMAGE="mcr.microsoft.com/oss/nvidia/k8s-device-plugin:${NVIDIA_DEVICE_PLUGIN_VERSION}"
pullContainerImage ${cliTool} ${DEVICE_PLUGIN_CONTAINER_IMAGE}

# GPU device plugin
# Runs only when FEATURE_FLAGS contains both "fullgpu" and "gpudaemon": copies the
# nvidia-device-plugin binary out of $DEVICE_PLUGIN_CONTAINER_IMAGE onto the host,
# starts the matching service, then removes the image.
if grep -q "fullgpu" <<< "$FEATURE_FLAGS" && grep -q "gpudaemon" <<< "$FEATURE_FLAGS"; then
# Create the kubelet device-plugins directory and record its path in the VHD log.
kubeletDevicePluginPath="/var/lib/kubelet/device-plugins"
mkdir -p $kubeletDevicePluginPath
echo " - $kubeletDevicePluginPath" >> ${VHD_LOGS_FILEPATH}

# Run a throwaway container (--rm) with $DEST bind-mounted read-write at the same
# path, and copy the plugin binary from the image into it; abort on failure.
DEST="/usr/local/nvidia/bin"
mkdir -p $DEST
ctr --namespace k8s.io run --rm --mount type=bind,src=${DEST},dst=${DEST},options=bind:rw --cwd ${DEST} $DEVICE_PLUGIN_CONTAINER_IMAGE plugingextract /bin/sh -c "cp /usr/bin/nvidia-device-plugin $DEST" || exit 1
chmod a+x $DEST/nvidia-device-plugin
echo " - extracted nvidia-device-plugin..." >> ${VHD_LOGS_FILEPATH}
ls -ltr $DEST >> ${VHD_LOGS_FILEPATH}

# systemctlEnableAndStart is defined elsewhere; presumably it enables and starts
# the nvidia-device-plugin unit (confirm). The image is deleted once the binary
# is on the host. Either step failing aborts the script with exit 1.
systemctlEnableAndStart nvidia-device-plugin || exit 1
ctr --namespace k8s.io images rm $DEVICE_PLUGIN_CONTAINER_IMAGE || exit 1
fi
fi

capture_benchmark "download_gpu_device_plugin"

mkdir -p /var/log/azure/Microsoft.Azure.Extensions.CustomScript/events
Expand Down
7 changes: 0 additions & 7 deletions vhdbuilder/packer/packer_source.sh
Original file line number Diff line number Diff line change
Expand Up @@ -348,13 +348,6 @@ copyPackerFiles() {
fi
fi

# GPU-specific files are copied only for "fullgpu" builds; the device plugin
# service file is copied in addition when "gpudaemon" is also in FEATURE_FLAGS.
if [[ "$FEATURE_FLAGS" == *"fullgpu"* ]]; then
  cpAndMode $NVIDIA_DOCKER_DAEMON_SRC $NVIDIA_DOCKER_DAEMON_DEST 644
  if [[ "$FEATURE_FLAGS" == *"gpudaemon"* ]]; then
    cpAndMode $NVIDIA_DEVICE_PLUGIN_SERVICE_SRC $NVIDIA_DEVICE_PLUGIN_SERVICE_DEST 644
  fi
fi

cpAndMode $NOTICE_SRC $NOTICE_DEST 444

# Always copy the VHD cleanup script responsible for prepping the instance for first boot
Expand Down

0 comments on commit 7f43bec

Please sign in to comment.