diff --git a/README.md b/README.md index d5b6fc77..74b4f555 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ __MS-AMP__ is an automatic mixed precision package for deep learning developed by Microsoft. -📢 [v0.3.0](https://github.com/Azure/MS-AMP/releases/tag/v0.3.0) has been released! +📢 [v0.4.0](https://github.com/Azure/MS-AMP/releases/tag/v0.4.0) has been released! ## _Check [aka.ms/msamp/doc](https://aka.ms/msamp/doc) for more details._ diff --git a/docs/developer-guides/using-docker.mdx b/docs/developer-guides/using-docker.mdx index 83b6e783..6f394145 100644 --- a/docs/developer-guides/using-docker.mdx +++ b/docs/developer-guides/using-docker.mdx @@ -16,19 +16,19 @@ You need to [clone the code](./development.md#set-up) first before building the - + ```bash export DOCKER_BUILDKIT=1 docker buildx build \ --platform linux/amd64 --cache-to type=inline,mode=max \ - --tag msamp-dev-cuda121 --file dockerfile/torch2.1-cuda12.1.dockerfile . + --tag msamp-dev-cuda122 --file dockerfile/torch2.1-cuda12.2.dockerfile . 
``` @@ -48,13 +48,13 @@ docker buildx build \ - + ```bash docker run \ @@ -62,7 +62,7 @@ docker run \ --privileged --net=host --ipc=host \ --gpus=all \ -w /root -v /mnt:/mnt \ - msamp-dev-cuda121 bash + msamp-dev-cuda122 bash ``` diff --git a/docs/user-tutorial/container-images.mdx b/docs/user-tutorial/container-images.mdx index dfcf6dce..f7e8cc26 100644 --- a/docs/user-tutorial/container-images.mdx +++ b/docs/user-tutorial/container-images.mdx @@ -25,6 +25,8 @@ You can use MS-AMP image by `ghcr.io/azure/msamp:${tag}`, available tags are lis | Tag | Description | |-------------------|------------------------------------| +| v0.4.0-cuda12.2 | MS-AMP v0.4.0 with CUDA 12.2 | +| v0.4.0-cuda11.8 | MS-AMP v0.4.0 with CUDA 11.8 | | v0.3.0-cuda12.1 | MS-AMP v0.3.0 with CUDA 12.1 | | v0.3.0-cuda11.8 | MS-AMP v0.3.0 with CUDA 11.8 | | v0.2.0-cuda12.1 | MS-AMP v0.2.0 with CUDA 12.1 | diff --git a/msamp/__init__.py b/msamp/__init__.py index aa40b9bf..c0841e1c 100644 --- a/msamp/__init__.py +++ b/msamp/__init__.py @@ -100,6 +100,6 @@ def initialize(model, optimizer=None, opt_level='O1', use_te=False): # noqa: return cast_model, cast_optimizer -__version__ = '0.3.0' +__version__ = '0.4.0' __author__ = 'Microsoft' __all__ = ['clip_grad_norm_', 'initialize'] diff --git a/msamp/deepspeed/runtime/engine.py b/msamp/deepspeed/runtime/engine.py index 5c974558..d87b39e4 100644 --- a/msamp/deepspeed/runtime/engine.py +++ b/msamp/deepspeed/runtime/engine.py @@ -11,6 +11,7 @@ FP16, BFLOAT16, logger, DeepSpeedEngine, instrument_w_nvtx, log_dist, \ see_memory_usage, DummyOptim, DeepSpeedZeroOptimizer, DeepSpeedZeRoOffload, \ PipelineModule, ZeroStageEnum +from deepspeed.utils.timer import NoopTimer from deepspeed.moe.utils import is_moe_param from deepspeed.accelerator import get_accelerator @@ -191,7 +192,8 @@ def _configure_zero_optimizer(self, optimizer): ZeROOptimizer: zero optimizer. 
""" zero_stage = self.zero_optimization_stage() - timers = self.timers if self.wall_clock_breakdown() else None + timers = self.timers if self.wall_clock_breakdown() else NoopTimer() + model_dtype, gradient_accumulation_dtype = self.get_data_types() if optimizer is None: optimizer = DummyOptim(list(self.module.parameters())) @@ -232,6 +234,7 @@ def _configure_zero_optimizer(self, optimizer): clip_grad=self.gradient_clipping(), contiguous_gradients=contiguous_gradients, reduce_bucket_size=self.zero_reduce_bucket_size(), + use_multi_rank_bucket_allreduce=self.zero_multi_rank_bucket_allreduce(), allgather_bucket_size=self.zero_allgather_bucket_size(), dp_process_group=self.data_parallel_group, expert_parallel_group=self.expert_parallel_group if self.has_moe_layers else None, @@ -248,6 +251,7 @@ def _configure_zero_optimizer(self, optimizer): round_robin_gradients=round_robin_gradients, has_moe_layers=self.has_moe_layers, fp16_master_weights_and_gradients=self.fp16_master_weights_and_gradients(), + gradient_accumulation_dtype=gradient_accumulation_dtype, communication_data_type=self.communication_data_type, elastic_checkpoint=self.zero_elastic_checkpoint() ) diff --git a/website/blog/2024-02-26-release-0-4.md b/website/blog/2024-02-26-release-0-4.md new file mode 100644 index 00000000..df31493b --- /dev/null +++ b/website/blog/2024-02-26-release-0-4.md @@ -0,0 +1,36 @@ +--- +slug: release-msamp-v0.4 +title: Releasing MS-AMP v0.4 +author: Yuxiang Yang +author_title: MS-AMP Team +author_url: https://github.com/tocean +tags: [MS-AMP, announcement, release] +--- + +We are very happy to announce that **MS-AMP version 0.4.0** is officially released today! + +You can install and try MS-AMP by following the [Getting Started Tutorial](https://azure.github.io/MS-AMP/docs/getting-started/installation). 
+ +## MS-AMP 0.4.0 Release Notes + +### MS-AMP Improvements + +- Improve GPT-3 performance by optimizing the FP8-gradient accumulation with kernel fusion technology +- Support FP8 in FSDP +- Support DeepSpeed+TE+MSAMP and add cifar10 example +- Support MSAMP+TE+DDP +- Update DeepSpeed to the latest version +- Update TransformerEngine to V1.1 and flash-attn to the latest version +- Support CUDA 12.2 +- Fix several bugs in DeepSpeed integration + +### MS-AMP-Examples Improvements + +- Improve documentation for data processing in GPT3 +- Add launch script for pretraining GPT-6b7 +- Use the new API of TransformerEngine in Megatron-LM + +### Document Improvements + +- Add docker usage in Installation page +- Explain how to run the FSDP and DeepSpeed+TE+MSAMP examples in the "Run Examples" page diff --git a/website/docusaurus.config.js b/website/docusaurus.config.js index e5e1c5fe..28c601e3 100644 --- a/website/docusaurus.config.js +++ b/website/docusaurus.config.js @@ -91,7 +91,7 @@ module.exports = { announcementBar: { id: 'supportus', content: - '📢 v0.3.0 has been released! ' + + '📢 v0.4.0 has been released! ' + '⭐️ If you like MS-AMP, give it a star on GitHub! ⭐️', }, algolia: { diff --git a/website/package-lock.json b/website/package-lock.json index 926aed33..249c3d65 100644 --- a/website/package-lock.json +++ b/website/package-lock.json @@ -1,12 +1,12 @@ { "name": "msamp-website", - "version": "0.3.0", + "version": "0.4.0", "lockfileVersion": 2, "requires": true, "packages": { "": { "name": "msamp-website", - "version": "0.3.0", + "version": "0.4.0", "dependencies": { "@docusaurus/core": "2.0.0-beta.1", "@docusaurus/preset-classic": "2.0.0-beta.1", diff --git a/website/package.json b/website/package.json index fa1d7f68..784f4e40 100644 --- a/website/package.json +++ b/website/package.json @@ -1,6 +1,6 @@ { "name": "msamp-website", - "version": "0.3.0", + "version": "0.4.0", "private": true, "scripts": { "docusaurus": "docusaurus",