From 8b8b13da9fec9e24266501d7de8c68464e61f1c3 Mon Sep 17 00:00:00 2001
From: malavhs <malavhs@amazon.com>
Date: Mon, 21 Oct 2024 00:59:39 +0000
Subject: [PATCH 1/5] Patch: hf-pt-train-1.13 for gevent and deepspeed

---
 dlc_developer_config.toml                                   | 6 +++---
 .../pytorch/training/docker/1.13/py3/cu117/Dockerfile.gpu   | 3 ++-
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml
index 1f6a589ccc14..330b237539f3 100644
--- a/dlc_developer_config.toml
+++ b/dlc_developer_config.toml
@@ -34,11 +34,11 @@ deep_canary_mode = false
 [build]
 # Add in frameworks you would like to build. By default, builds are disabled unless you specify building an image.
 # available frameworks - ["autogluon", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "mxnet", "pytorch", "stabilityai_pytorch"]
-build_frameworks = []
+build_frameworks = [huggingface_pytorch]
 
 # By default we build both training and inference containers. Set true/false values to determine which to build.
 build_training = true
-build_inference = true
+build_inference = false
 
 # Set do_build to "false" to skip builds and test the latest image built by this PR
 # Note: at least one build is required to set do_build to "false"
@@ -107,7 +107,7 @@ dlc-pr-autogluon-training = ""
 
 # HuggingFace Training
 dlc-pr-huggingface-tensorflow-training = ""
-dlc-pr-huggingface-pytorch-training = ""
+dlc-pr-huggingface-pytorch-training = "huggingface/pytorch/training/buildspec1.13.yml"
 
 # Training Compiler
 dlc-pr-huggingface-pytorch-trcomp-training = ""
diff --git a/huggingface/pytorch/training/docker/1.13/py3/cu117/Dockerfile.gpu b/huggingface/pytorch/training/docker/1.13/py3/cu117/Dockerfile.gpu
index c27260c19ad5..3e8dcf26d7dc 100644
--- a/huggingface/pytorch/training/docker/1.13/py3/cu117/Dockerfile.gpu
+++ b/huggingface/pytorch/training/docker/1.13/py3/cu117/Dockerfile.gpu
@@ -26,8 +26,9 @@ RUN pip install --no-cache-dir \
     multiprocess==0.70.14 \
     dill==0.3.6 \
     sagemaker==2.227.0 \
+    deepspeed==0.15.2 \
     evaluate \
-    gevent~=23.9.0 \
+    gevent~=24.10.1 \
     pyarrow~=14.0.1
 
 RUN apt-get update \

From c028f910108dbf6d9129e211eddc0cd75b88eca0 Mon Sep 17 00:00:00 2001
From: malavhs <malavhs@amazon.com>
Date: Mon, 21 Oct 2024 01:24:00 +0000
Subject: [PATCH 2/5] fix typo: quote huggingface_pytorch in build_frameworks

---
 dlc_developer_config.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml
index 330b237539f3..100e1ec48e3b 100644
--- a/dlc_developer_config.toml
+++ b/dlc_developer_config.toml
@@ -34,7 +34,7 @@ deep_canary_mode = false
 [build]
 # Add in frameworks you would like to build. By default, builds are disabled unless you specify building an image.
 # available frameworks - ["autogluon", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "mxnet", "pytorch", "stabilityai_pytorch"]
-build_frameworks = [huggingface_pytorch]
+build_frameworks = ["huggingface_pytorch"]
 
 # By default we build both training and inference containers. Set true/false values to determine which to build.
 build_training = true

From c1ed4375d1cf45ee2cff6102a5e5d0af665303e8 Mon Sep 17 00:00:00 2001
From: malavhs <malavhs@amazon.com>
Date: Mon, 21 Oct 2024 01:31:38 +0000
Subject: [PATCH 3/5] fix typo: correct buildspec filename to buildspec-1-13.yml

---
 dlc_developer_config.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml
index 100e1ec48e3b..df5b0878c34b 100644
--- a/dlc_developer_config.toml
+++ b/dlc_developer_config.toml
@@ -107,7 +107,7 @@ dlc-pr-autogluon-training = ""
 
 # HuggingFace Training
 dlc-pr-huggingface-tensorflow-training = ""
-dlc-pr-huggingface-pytorch-training = "huggingface/pytorch/training/buildspec1.13.yml"
+dlc-pr-huggingface-pytorch-training = "huggingface/pytorch/training/buildspec-1-13.yml"
 
 # Training Compiler
 dlc-pr-huggingface-pytorch-trcomp-training = ""

From 69ae0f205c09c8e86048c8ce69f6415da3501e76 Mon Sep 17 00:00:00 2001
From: malavhs <malavhs@amazon.com>
Date: Mon, 21 Oct 2024 04:11:21 +0000
Subject: [PATCH 4/5] bump dataset

---
 huggingface/pytorch/training/buildspec-1-13.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/huggingface/pytorch/training/buildspec-1-13.yml b/huggingface/pytorch/training/buildspec-1-13.yml
index 7c10f0a23073..e512ce77279c 100644
--- a/huggingface/pytorch/training/buildspec-1-13.yml
+++ b/huggingface/pytorch/training/buildspec-1-13.yml
@@ -25,7 +25,7 @@ images:
     cuda_version: &CUDA_VERSION cu117
     os_version: &OS_VERSION ubuntu20.04
     transformers_version: &TRANSFORMERS_VERSION 4.26.0
-    datasets_version: &DATASETS_VERSION 2.16.1
+    datasets_version: &DATASETS_VERSION 3.0.1
     tag: !join [ *VERSION, '-', 'transformers', *TRANSFORMERS_VERSION, '-', *DEVICE_TYPE, '-', *TAG_PYTHON_VERSION, '-',
                  *CUDA_VERSION, '-', *OS_VERSION ]
     docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /,

From fc02ea0e6627f923c9a5431857c3bf24080ea68a Mon Sep 17 00:00:00 2001
From: malavhs <malavhs@amazon.com>
Date: Mon, 21 Oct 2024 05:24:33 +0000
Subject: [PATCH 5/5] Revert "bump dataset"

This reverts commit 69ae0f205c09c8e86048c8ce69f6415da3501e76.
---
 huggingface/pytorch/training/buildspec-1-13.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/huggingface/pytorch/training/buildspec-1-13.yml b/huggingface/pytorch/training/buildspec-1-13.yml
index e512ce77279c..7c10f0a23073 100644
--- a/huggingface/pytorch/training/buildspec-1-13.yml
+++ b/huggingface/pytorch/training/buildspec-1-13.yml
@@ -25,7 +25,7 @@ images:
     cuda_version: &CUDA_VERSION cu117
     os_version: &OS_VERSION ubuntu20.04
     transformers_version: &TRANSFORMERS_VERSION 4.26.0
-    datasets_version: &DATASETS_VERSION 3.0.1
+    datasets_version: &DATASETS_VERSION 2.16.1
     tag: !join [ *VERSION, '-', 'transformers', *TRANSFORMERS_VERSION, '-', *DEVICE_TYPE, '-', *TAG_PYTHON_VERSION, '-',
                  *CUDA_VERSION, '-', *OS_VERSION ]
     docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /,