From 8b8b13da9fec9e24266501d7de8c68464e61f1c3 Mon Sep 17 00:00:00 2001 From: malavhs Date: Mon, 21 Oct 2024 00:59:39 +0000 Subject: [PATCH 1/5] Patch: hf-pt-train-1.13 for gevent and deepspeed --- dlc_developer_config.toml | 6 +++--- .../pytorch/training/docker/1.13/py3/cu117/Dockerfile.gpu | 3 ++- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index 1f6a589ccc14..330b237539f3 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -34,11 +34,11 @@ deep_canary_mode = false [build] # Add in frameworks you would like to build. By default, builds are disabled unless you specify building an image. # available frameworks - ["autogluon", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "mxnet", "pytorch", "stabilityai_pytorch"] -build_frameworks = [] +build_frameworks = [huggingface_pytorch] # By default we build both training and inference containers. Set true/false values to determine which to build. 
build_training = true -build_inference = true +build_inference = false # Set do_build to "false" to skip builds and test the latest image built by this PR # Note: at least one build is required to set do_build to "false" @@ -107,7 +107,7 @@ dlc-pr-autogluon-training = "" # HuggingFace Training dlc-pr-huggingface-tensorflow-training = "" -dlc-pr-huggingface-pytorch-training = "" +dlc-pr-huggingface-pytorch-training = "huggingface/pytorch/training/buildspec1.13.yml" # Training Compiler dlc-pr-huggingface-pytorch-trcomp-training = "" diff --git a/huggingface/pytorch/training/docker/1.13/py3/cu117/Dockerfile.gpu b/huggingface/pytorch/training/docker/1.13/py3/cu117/Dockerfile.gpu index c27260c19ad5..3e8dcf26d7dc 100644 --- a/huggingface/pytorch/training/docker/1.13/py3/cu117/Dockerfile.gpu +++ b/huggingface/pytorch/training/docker/1.13/py3/cu117/Dockerfile.gpu @@ -26,8 +26,9 @@ RUN pip install --no-cache-dir \ multiprocess==0.70.14 \ dill==0.3.6 \ sagemaker==2.227.0 \ + deepspeed==0.15.2 \ evaluate \ - gevent~=23.9.0 \ + gevent~=24.10.1 \ pyarrow~=14.0.1 RUN apt-get update \ From c028f910108dbf6d9129e211eddc0cd75b88eca0 Mon Sep 17 00:00:00 2001 From: malavhs Date: Mon, 21 Oct 2024 01:24:00 +0000 Subject: [PATCH 2/5] fix typo --- dlc_developer_config.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index 330b237539f3..100e1ec48e3b 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -34,7 +34,7 @@ deep_canary_mode = false [build] # Add in frameworks you would like to build. By default, builds are disabled unless you specify building an image. 
# available frameworks - ["autogluon", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "mxnet", "pytorch", "stabilityai_pytorch"] -build_frameworks = [huggingface_pytorch] +build_frameworks = ["huggingface_pytorch"] # By default we build both training and inference containers. Set true/false values to determine which to build. build_training = true From c1ed4375d1cf45ee2cff6102a5e5d0af665303e8 Mon Sep 17 00:00:00 2001 From: malavhs Date: Mon, 21 Oct 2024 01:31:38 +0000 Subject: [PATCH 3/5] fix typo --- dlc_developer_config.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index 100e1ec48e3b..df5b0878c34b 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -107,7 +107,7 @@ dlc-pr-autogluon-training = "" # HuggingFace Training dlc-pr-huggingface-tensorflow-training = "" -dlc-pr-huggingface-pytorch-training = "huggingface/pytorch/training/buildspec1.13.yml" +dlc-pr-huggingface-pytorch-training = "huggingface/pytorch/training/buildspec-1-13.yml" # Training Compiler dlc-pr-huggingface-pytorch-trcomp-training = "" From 69ae0f205c09c8e86048c8ce69f6415da3501e76 Mon Sep 17 00:00:00 2001 From: malavhs Date: Mon, 21 Oct 2024 04:11:21 +0000 Subject: [PATCH 4/5] bump dataset --- huggingface/pytorch/training/buildspec-1-13.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/huggingface/pytorch/training/buildspec-1-13.yml b/huggingface/pytorch/training/buildspec-1-13.yml index 7c10f0a23073..e512ce77279c 100644 --- a/huggingface/pytorch/training/buildspec-1-13.yml +++ b/huggingface/pytorch/training/buildspec-1-13.yml @@ -25,7 +25,7 @@ images: cuda_version: &CUDA_VERSION cu117 os_version: &OS_VERSION ubuntu20.04 transformers_version: &TRANSFORMERS_VERSION 4.26.0 - datasets_version: &DATASETS_VERSION 2.16.1 + datasets_version: &DATASETS_VERSION 3.0.1 tag: !join [ *VERSION, 
'-', 'transformers', *TRANSFORMERS_VERSION, '-', *DEVICE_TYPE, '-', *TAG_PYTHON_VERSION, '-', *CUDA_VERSION, '-', *OS_VERSION ] docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, From fc02ea0e6627f923c9a5431857c3bf24080ea68a Mon Sep 17 00:00:00 2001 From: malavhs Date: Mon, 21 Oct 2024 05:24:33 +0000 Subject: [PATCH 5/5] Revert "bump dataset" This reverts commit 69ae0f205c09c8e86048c8ce69f6415da3501e76. --- huggingface/pytorch/training/buildspec-1-13.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/huggingface/pytorch/training/buildspec-1-13.yml b/huggingface/pytorch/training/buildspec-1-13.yml index e512ce77279c..7c10f0a23073 100644 --- a/huggingface/pytorch/training/buildspec-1-13.yml +++ b/huggingface/pytorch/training/buildspec-1-13.yml @@ -25,7 +25,7 @@ images: cuda_version: &CUDA_VERSION cu117 os_version: &OS_VERSION ubuntu20.04 transformers_version: &TRANSFORMERS_VERSION 4.26.0 - datasets_version: &DATASETS_VERSION 3.0.1 + datasets_version: &DATASETS_VERSION 2.16.1 tag: !join [ *VERSION, '-', 'transformers', *TRANSFORMERS_VERSION, '-', *DEVICE_TYPE, '-', *TAG_PYTHON_VERSION, '-', *CUDA_VERSION, '-', *OS_VERSION ] docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /,