diff --git a/helpers/configuration/cmd_args.py b/helpers/configuration/cmd_args.py
index 2589371f..f8cd4702 100644
--- a/helpers/configuration/cmd_args.py
+++ b/helpers/configuration/cmd_args.py
@@ -1336,6 +1336,17 @@ def get_argument_parser():
         default=1e-08,
         help="Epsilon value for the Adam optimizer",
     )
+    parser.add_argument(
+        "--prodigy_steps",
+        type=int,
+        default=None,
+        help=(
+            "When training with Prodigy, this defines how many steps it should adjust its learning rate for."
+            " Diffusion models seem to benefit from capping these adjustments after roughly 25 percent"
+            " of the training run (dependent on batch size, repeats, and epochs)."
+            " If this value is not supplied, it will default to 25 percent of your total training steps."
+        ),
+    )
     parser.add_argument(
         "--max_grad_norm",
         default=2.0,
diff --git a/helpers/training/optimizer_param.py b/helpers/training/optimizer_param.py
index d2863c7e..e8541130 100644
--- a/helpers/training/optimizer_param.py
+++ b/helpers/training/optimizer_param.py
@@ -61,6 +61,18 @@
 if "AdEMAMix" in dir(bitsandbytes.optim):
     is_ademamix_available = True
 
+is_prodigy_available = False
+try:
+    import prodigyplus
+
+    is_prodigy_available = True
+except ImportError:
+    if torch.cuda.is_available():
+        logger.warning(
+            "Could not load prodigyplus library. Prodigy will not be available."
+        )
+
+
 optimizer_choices = {
     "adamw_bf16": {
         "precision": "bf16",
@@ -456,6 +468,42 @@
         }
     )
 
+if is_prodigy_available:
+    optimizer_choices.update(
+        {
+            "prodigy": {
+                "precision": "any",
+                "override_lr_scheduler": True,
+                "can_warmup": False,
+                "default_settings": {
+                    "lr": 1.0,
+                    "betas": (0.9, 0.99),
+                    "beta3": None,
+                    "weight_decay": 0.0,
+                    "weight_decay_by_lr": True,
+                    "use_bias_correction": False,
+                    "d0": 1e-6,
+                    "d_coef": 1,
+                    "prodigy_steps": 0,
+                    "use_speed": False,
+                    "eps": 1e-8,
+                    "split_groups": True,
+                    "split_groups_mean": True,
+                    "factored": True,
+                    "factored_fp32": True,
+                    "fused_back_pass": False,
+                    "use_stableadamw": True,
+                    "use_muon_pp": False,
+                    "use_cautious": False,
+                    "use_grams": False,
+                    "use_adopt": False,
+                    "stochastic_rounding": True,
+                },
+                "class": prodigyplus.prodigy_plus_schedulefree.ProdigyPlusScheduleFree,
+            }
+        }
+    )
+
 args_to_optimizer_mapping = {
     "use_adafactor_optimizer": "adafactor",
     "use_prodigy_optimizer": "prodigy",
@@ -465,7 +513,6 @@
 }
 
 deprecated_optimizers = {
-    "prodigy": "Prodigy optimiser has been removed due to issues with precision levels and convergence. Please use adamw_schedulefree instead.",
     "dadaptation": "D-adaptation optimiser has been removed due to issues with precision levels and convergence. Please use adamw_schedulefree instead.",
     "adafactor": "Adafactor optimiser has been removed in favour of optimi-stableadamw, which offers improved memory efficiency and convergence.",
     "adamw8bit": "AdamW8Bit has been removed in favour of optimi-adamw optimiser, which offers better low-precision support. Please use this or adamw_bf16 instead.",
@@ -512,6 +559,16 @@ def optimizer_parameters(optimizer, args):
         if args.optimizer_release_gradients and "optimi-" in optimizer:
             optimizer_params["gradient_release"] = True
         optimizer_details["default_settings"] = optimizer_params
+        if args.optimizer == "prodigy":
+            prodigy_steps = args.prodigy_steps
+            if prodigy_steps and prodigy_steps > 0:
+                optimizer_params["prodigy_steps"] = prodigy_steps
+            else:
+                # 25% of the total number of steps
+                optimizer_params["prodigy_steps"] = int(args.max_train_steps * 0.25)
+            print(
+                f"Using Prodigy optimiser with {optimizer_params['prodigy_steps']} steps of learning rate adjustment."
+            )
         return optimizer_class, optimizer_details
     else:
         raise ValueError(f"Optimizer {optimizer} not found.")
diff --git a/helpers/training/trainer.py b/helpers/training/trainer.py
index a492c2af..e864ee4c 100644
--- a/helpers/training/trainer.py
+++ b/helpers/training/trainer.py
@@ -1232,9 +1232,7 @@ def init_optimizer(self):
     def init_lr_scheduler(self):
         self.config.is_schedulefree = is_lr_scheduler_disabled(self.config.optimizer)
         if self.config.is_schedulefree:
-            logger.info(
-                "Using experimental AdamW ScheduleFree optimiser from Facebook. Experimental due to newly added Kahan summation."
-            )
+            logger.info("Using experimental ScheduleFree optimiser.")
             # we don't use LR schedulers with schedulefree optimisers
             lr_scheduler = None
         if not self.config.use_deepspeed_scheduler and not self.config.is_schedulefree:
@@ -2778,12 +2776,14 @@ def train(self):
                         if param.grad is not None:
                             param.grad.data = param.grad.data.to(torch.float32)
 
+                    self.grad_norm = self._max_grad_value()
                     if (
                         self.accelerator.sync_gradients
-                        and self.config.optimizer != "optimi-stableadamw"
+                        and self.config.optimizer
+                        not in ["optimi-stableadamw", "prodigy"]
                         and self.config.max_grad_norm > 0
                     ):
-                        # StableAdamW does not need clipping, similar to Adafactor.
+                        # StableAdamW/Prodigy do not need clipping, similar to Adafactor.
                         if self.config.grad_clip_method == "norm":
                             self.grad_norm = self.accelerator.clip_grad_norm_(
                                 self._get_trainable_parameters(),
@@ -2793,7 +2793,6 @@ def train(self):
                             # deepspeed can only do norm clipping (internally)
                             pass
                         elif self.config.grad_clip_method == "value":
-                            self.grad_norm = self._max_grad_value()
                             self.accelerator.clip_grad_value_(
                                 self._get_trainable_parameters(),
                                 self.config.max_grad_norm,
@@ -2824,7 +2823,22 @@ def train(self):
                wandb_logs = {}
                if self.accelerator.sync_gradients:
                    try:
-                        if self.config.is_schedulefree:
+                        if "prodigy" in self.config.optimizer:
+                            self.lr = self.optimizer.param_groups[0]["d"]
+                            wandb_logs.update(
+                                {
+                                    "prodigy/d": self.optimizer.param_groups[0]["d"],
+                                    "prodigy/d_prev": self.optimizer.param_groups[0][
+                                        "d_prev"
+                                    ],
+                                    "prodigy/d0": self.optimizer.param_groups[0]["d0"],
+                                    "prodigy/d_coef": self.optimizer.param_groups[0][
+                                        "d_coef"
+                                    ],
+                                    "prodigy/k": self.optimizer.param_groups[0]["k"],
+                                }
+                            )
+                        elif self.config.is_schedulefree:
                             # hackjob method of retrieving LR from accelerated optims
                             self.lr = StateTracker.get_last_lr()
                         else:
@@ -2834,12 +2848,14 @@ def train(self):
                        logger.error(
                            f"Failed to get the last learning rate from the scheduler. Error: {e}"
                        )
-                wandb_logs = {
-                    "train_loss": self.train_loss,
-                    "optimization_loss": loss,
-                    "learning_rate": self.lr,
-                    "epoch": epoch,
-                }
+                wandb_logs.update(
+                    {
+                        "train_loss": self.train_loss,
+                        "optimization_loss": loss,
+                        "learning_rate": self.lr,
+                        "epoch": epoch,
+                    }
+                )
                 if parent_loss is not None:
                     wandb_logs["regularisation_loss"] = parent_loss
                 if self.config.model_family == "flux" and self.guidance_values_list:
@@ -2850,7 +2866,7 @@ def train(self):
                if self.grad_norm is not None:
                    if self.config.grad_clip_method == "norm":
                        wandb_logs["grad_norm"] = self.grad_norm
-                    elif self.config.grad_clip_method == "value":
+                    else:
                        wandb_logs["grad_absmax"] = self.grad_norm
                if self.validation is not None and hasattr(
                    self.validation, "evaluation_result"
diff --git a/install/apple/poetry.lock b/install/apple/poetry.lock
index 428cfa2c..d9f03b79 100644
--- a/install/apple/poetry.lock
+++ b/install/apple/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand.
 
 [[package]]
 name = "absl-py"
@@ -2153,6 +2153,20 @@ docs = ["furo (>=2024.8.6)", "proselint (>=0.14)", "sphinx (>=8.0.2)", "sphinx-a
 test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=8.3.2)", "pytest-cov (>=5)", "pytest-mock (>=3.14)"]
 type = ["mypy (>=1.11.2)"]
 
+[[package]]
+name = "prodigy-plus-schedule-free"
+version = "1.8.51"
+description = "Automatic learning rate optimiser based on Prodigy and Schedule-Free"
+optional = false
+python-versions = ">=3.4"
+files = [
+    {file = "prodigy_plus_schedule_free-1.8.51-py3-none-any.whl", hash = "sha256:db2e8e7443e8a8b83e412c8e6b81ac67c72135a8065c1dc001d781a9d2f5dbef"},
+    {file = "prodigy_plus_schedule_free-1.8.51.tar.gz", hash = "sha256:19e5acb151c54f4326b11e6f308f57acbaa4fba5834bf3a32d759a5de11ba6e7"},
+]
+
+[package.dependencies]
+torch = ">=2.0"
+
 [[package]]
 name = "protobuf"
 version = "5.28.1"
@@ -4259,4 +4273,4 @@ type = ["pytest-mypy"]
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.10,<3.13"
-content-hash = "9236f5c973de2929a0282d519155c82148153edfe35b8a475fcea30326f5bc7e"
+content-hash = "cec13154617cd259501890e6ef65690ff13004afb76394ac44d27f7d93b0a2b5"
diff --git a/install/apple/pyproject.toml b/install/apple/pyproject.toml
index 7ee3b640..997e6718 100644
--- a/install/apple/pyproject.toml
+++ b/install/apple/pyproject.toml
@@ -47,6 +47,7 @@ torchao = "^0.7.0"
 torchaudio = "^2.5.0"
 atomicwrites = "^1.4.1"
 beautifulsoup4 = "^4.12.3"
+prodigy-plus-schedule-free = "^1.8.51"
 
 
 [build-system]
diff --git a/install/rocm/poetry.lock b/install/rocm/poetry.lock
index 2d0d236f..062d4026 100644
--- a/install/rocm/poetry.lock
+++ b/install/rocm/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand.
 
 [[package]]
 name = "absl-py"
@@ -1993,6 +1993,20 @@ docs = ["furo (>=2023.9.10)", "proselint (>=0.13)", "sphinx (>=7.2.6)", "sphinx-
 test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=7.4.3)", "pytest-cov (>=4.1)", "pytest-mock (>=3.12)"]
 type = ["mypy (>=1.8)"]
 
+[[package]]
+name = "prodigy-plus-schedule-free"
+version = "1.8.51"
+description = "Automatic learning rate optimiser based on Prodigy and Schedule-Free"
+optional = false
+python-versions = ">=3.4"
+files = [
+    {file = "prodigy_plus_schedule_free-1.8.51-py3-none-any.whl", hash = "sha256:db2e8e7443e8a8b83e412c8e6b81ac67c72135a8065c1dc001d781a9d2f5dbef"},
+    {file = "prodigy_plus_schedule_free-1.8.51.tar.gz", hash = "sha256:19e5acb151c54f4326b11e6f308f57acbaa4fba5834bf3a32d759a5de11ba6e7"},
+]
+
+[package.dependencies]
+torch = ">=2.0"
+
 [[package]]
 name = "protobuf"
 version = "4.25.4"
@@ -4015,4 +4029,4 @@ test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools",
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.10,<3.13"
-content-hash = "1a1142eddd00119973bb3ef488a3b1dc49416cfc3945b3f1f293b34f5b836d67"
+content-hash = "4443d5137bc126d539a404a63d854ecd119c92aa8b2c0deb0892fb9d6f72f7af"
diff --git a/install/rocm/pyproject.toml b/install/rocm/pyproject.toml
index 51e46683..b1d2f474 100644
--- a/install/rocm/pyproject.toml
+++ b/install/rocm/pyproject.toml
@@ -47,6 +47,7 @@ bitsandbytes = "^0.44.1"
 atomicwrites = "^1.4.1"
 torchao = "^0.7.0"
 beautifulsoup4 = "^4.12.3"
+prodigy-plus-schedule-free = "^1.8.51"
 
 
 [build-system]
diff --git a/poetry.lock b/poetry.lock
index 051c042b..04b746b0 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -2729,6 +2729,20 @@ docs = ["sphinx (>=1.7.1)"]
 redis = ["redis"]
 tests = ["pytest (>=5.4.1)", "pytest-cov (>=2.8.1)", "pytest-mypy (>=0.8.0)", "pytest-timeout (>=2.1.0)", "redis", "sphinx (>=6.0.0)", "types-redis"]
 
+[[package]]
+name = "prodigy-plus-schedule-free"
+version = "1.8.51"
+description = "Automatic learning rate optimiser based on Prodigy and Schedule-Free"
+optional = false
+python-versions = ">=3.4"
+files = [
+    {file = "prodigy_plus_schedule_free-1.8.51-py3-none-any.whl", hash = "sha256:db2e8e7443e8a8b83e412c8e6b81ac67c72135a8065c1dc001d781a9d2f5dbef"},
+    {file = "prodigy_plus_schedule_free-1.8.51.tar.gz", hash = "sha256:19e5acb151c54f4326b11e6f308f57acbaa4fba5834bf3a32d759a5de11ba6e7"},
+]
+
+[package.dependencies]
+torch = ">=2.0"
+
 [[package]]
 name = "propcache"
 version = "0.2.0"
@@ -5359,4 +5373,4 @@ cffi = ["cffi (>=1.11)"]
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.10,<3.12"
-content-hash = "c4655978cc7d793ef49f1906503b3ee53e1d625c473740cb982aabac16fe9929"
+content-hash = "0d8538a573aada976d0ae6ec34b7efa6328a7c7185188362482e1fa93ae72c53"
diff --git a/pyproject.toml b/pyproject.toml
index fc529e8f..fa176b44 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -52,6 +52,7 @@ nvidia-cudnn-cu12 = "*"
 nvidia-nccl-cu12 = "*"
 atomicwrites = "^1.4.1"
 beautifulsoup4 = "^4.12.3"
+prodigy-plus-schedule-free = "^1.8.51"