
Commit

Merge branch 'main' into fix/shape-dimension-order
MaxFBurg authored Mar 8, 2024
2 parents 27d4b70 + 9b9ab51 commit 28aedec
Showing 4 changed files with 124 additions and 75 deletions.
39 changes: 39 additions & 0 deletions neuralpredictors/layers/cores/base.py
@@ -1,4 +1,5 @@
from abc import ABC, abstractmethod
from collections import OrderedDict

from torch import nn

@@ -51,3 +52,41 @@ def __repr__(self):
for attr in filter(lambda x: "gamma" in x or "skip" in x, dir(self)):
ret.append(f"{attr} = {getattr(self, attr)}")
return s + "|".join(ret) + "]\n"


class ConvCore(Core):
def __init__(self) -> None:
"""
Derived classes need to define "batch_norm", "hidden_channels", "momentum", "bias", "batch_norm_scale" attributes.
"""
super().__init__()
self.set_batchnorm_type()

@abstractmethod
def set_batchnorm_type(self):
"""
Set batchnorm_layer_cls, bias_layer_cls, scale_layer_cls class attributes
"""
self.batchnorm_layer_cls = None
self.bias_layer_cls = None
self.scale_layer_cls = None

def add_bn_layer(self, layer: OrderedDict, layer_idx: int):
for attr in ["batch_norm", "hidden_channels", "momentum", "bias", "batch_norm_scale"]:
if not hasattr(self, attr):
raise NotImplementedError(f"Subclasses must have a `{attr}` attribute.")
for attr in ["batch_norm", "hidden_channels", "bias", "batch_norm_scale"]:
if not isinstance(getattr(self, attr), list):
raise ValueError(f"`{attr}` must be a list.")

if self.batch_norm[layer_idx]:
hidden_channels = self.hidden_channels[layer_idx]

bias = self.bias[layer_idx]
scale = self.batch_norm_scale[layer_idx]

layer["norm"] = self.batchnorm_layer_cls(hidden_channels, momentum=self.momentum, affine=bias and scale)
if bias and not scale:
layer["bias"] = self.bias_layer_cls(hidden_channels)
elif not bias and scale:
layer["scale"] = self.scale_layer_cls(hidden_channels)
31 changes: 6 additions & 25 deletions neuralpredictors/layers/cores/conv2d.py
@@ -1,7 +1,7 @@
import logging
import warnings
from collections import OrderedDict
from typing import Union
from typing import List, Union

try:
from collections import Iterable
@@ -27,12 +27,12 @@
RotationEquivariantScale2DLayer,
)
from ..squeeze_excitation import SqueezeExcitationBlock
from .base import Core
from .base import ConvCore, Core

logger = logging.getLogger(__name__)


class Stacked2dCore(Core, nn.Module):
class Stacked2dCore(ConvCore, nn.Module):
"""
An instantiation of the Core base class. Made up of layers of nn.Sequential modules.
Allows for the flexible implementations of many different architectures, such as convolutional layers,
@@ -53,12 +53,12 @@ def __init__(
input_stride=1,
final_nonlinearity=True,
elu_shift=(0, 0),
bias: Union[bool, list[bool]] = True,
bias: Union[bool, List[bool]] = True,
momentum=0.1,
pad_input=True,
hidden_padding=None,
batch_norm: Union[bool, list[bool]] = True,
batch_norm_scale: Union[bool, list[bool]] = True,
batch_norm: Union[bool, List[bool]] = True,
batch_norm_scale: Union[bool, List[bool]] = True,
final_batchnorm_scale: bool = True,
hidden_dilation=1,
laplace_padding=0,
@@ -109,11 +109,6 @@ def __init__(
linear: Boolean, if True, removes all nonlinearities
nonlinearity_type: String to set the used nonlinearity type loaded from neuralpredictors.layers.activation
nonlinearity_config: Dict of the nonlinearities __init__ parameters.
To enable learning batch_norms bias and scale independently, the arguments bias, batch_norm and batch_norm_scale
work together: By default, all are true. In this case there won't be a bias learned in the convolutional layer, but
batch_norm will learn both its bias and scale. If batch_norm is false, but bias true, a bias will be learned in the
convolutional layer. If batch_norm and bias are true, but batch_norm_scale is false, batch_norm won't have learnable
parameters and a BiasLayer will be added after the batch_norm layer.
"""

if depth_separable and attention_conv:
@@ -194,7 +189,6 @@ def __init__(
warnings.warn(
"group sparsity can not be calculated for the requested conv type. Hidden channels will not be regularized and gamma_hidden is ignored."
)
self.set_batchnorm_type()
self.features = nn.Sequential()
self.add_first_layer()
self.add_subsequent_layers()
@@ -205,19 +199,6 @@ def set_batchnorm_type(self):
self.bias_layer_cls = Bias2DLayer
self.scale_layer_cls = Scale2DLayer

# def add_bn_layer(self, layer, hidden_channels):
def add_bn_layer(self, layer: OrderedDict, layer_idx: int):
if self.batch_norm[layer_idx]:
hidden_channels = self.hidden_channels[layer_idx]
bias = self.bias[layer_idx]
scale = self.batch_norm_scale[layer_idx]

layer["norm"] = self.batchnorm_layer_cls(hidden_channels, momentum=self.momentum, affine=bias and scale)
if bias and not scale:
layer["bias"] = self.bias_layer_cls(hidden_channels)
elif not bias and scale:
layer["scale"] = self.scale_layer_cls(hidden_channels)

def penultimate_layer_built(self):
"""Returns True if the penultimate layer has been built."""
return len(self.features) == self.num_layers - 1
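Since bias, batch_norm, and batch_norm_scale are now typed Union[bool, List[bool]], they can be configured per layer. A hypothetical construction is sketched below; only the three list-valued flags come from this diff, while the remaining constructor arguments (input_channels, hidden_channels, input_kern, hidden_kern, layers) and the list form of hidden_channels are assumed and should be checked against the full signature:

from neuralpredictors.layers.cores.conv2d import Stacked2dCore

core = Stacked2dCore(
    input_channels=1,
    hidden_channels=[32, 32, 32],
    input_kern=9,
    hidden_kern=7,
    layers=3,
    bias=[True, True, False],              # layer 0: affine batch norm only
    batch_norm=[True, True, True],         # layer 1: non-affine batch norm + Bias2DLayer
    batch_norm_scale=[True, False, True],  # layer 2: non-affine batch norm + Scale2DLayer
)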
50 changes: 7 additions & 43 deletions neuralpredictors/layers/cores/conv3d.py
@@ -11,10 +11,10 @@

from ...regularizers import DepthLaplaceL21d
from ..affine import Bias3DLayer, Scale3DLayer
from .base import Core
from .base import ConvCore


class Core3d(Core):
class Core3d(ConvCore):
def initialize(self, cuda=False):
self.apply(self.init_conv)
self.put_to_cuda(cuda=cuda)
@@ -30,6 +30,11 @@ def init_conv(m):
if m.bias is not None:
m.bias.data.fill_(0)

def set_batchnorm_type(self):
self.batchnorm_layer_cls = nn.BatchNorm3d
self.bias_layer_cls = Bias3DLayer
self.scale_layer_cls = Scale3DLayer


class Basic3dCore(Core3d, nn.Module):
def __init__(
@@ -54,13 +59,11 @@ def __init__(
input_regularizer="LaplaceL2norm",
cuda=False,
final_nonlin=True,
independent_bn_bias=True,
spatial_dilation: int = 1,
temporal_dilation: int = 1,
hidden_spatial_dilation=1,
hidden_temporal_dilation=1,
):

"""
:param input_channels: integer, number of input channels as in
:param hidden_channels: number of hidden channels (i.e feature maps) in each hidden layer
@@ -85,14 +88,6 @@
zero is the default however to recreate backwards compatibility.
:param input_regularizer: specifies what kind of spatial regularizer is applied
:param final_nonlin: bool specifying whether to include a nonlinearity after the last convolutional layer in the core
:param independent_bn_bias: If False, will allow for scaling the batch norm, so that batch norm
and bias can both be true. Defaults to True.
To enable learning batch_norms bias and scale independently, the arguments bias, batch_norm and batch_norm_scale
work together: By default, all are true. In this case there won't be a bias learned in the convolutional layer, but
batch_norm will learn both its bias and scale. If batch_norm is false, but bias true, a bias will be learned in the
convolutional layer. If batch_norm and bias are true, but batch_norm_scale is false, batch_norm won't have learnable
parameters and a BiasLayer will be added after the batch_norm layer.
"""
super().__init__()

@@ -112,7 +107,6 @@ def __init__(
self.bias = bias
self.batch_norm = batch_norm
self.batch_norm_scale = batch_norm_scale
self.independent_bn_bias = independent_bn_bias
self.momentum = momentum
self.spatial_dilation = spatial_dilation
self.temporal_dilation = temporal_dilation
@@ -225,19 +219,6 @@ def laplace_temporal(self):
def regularizer(self):
return self.gamma_input_spatial * self.laplace_spatial(), self.gamma_input_temporal * self.laplace_temporal()

def add_bn_layer(self, layer, hidden_channels):
if self.batch_norm:
if self.independent_bn_bias:
layer["norm"] = nn.BatchNorm3d(hidden_channels, momentum=self.momentum)
else:
layer["norm"] = nn.BatchNorm3d(
hidden_channels, momentum=self.momentum, affine=self.bias and self.batch_norm_scale
)
if self.bias and not self.batch_norm_scale:
layer["bias"] = Bias3DLayer(hidden_channels)
elif self.batch_norm_scale:
layer["scale"] = Scale3DLayer(hidden_channels)

@property
def out_channels(self):
return self.hidden_channels[-1]
@@ -267,7 +248,6 @@ def __init__(
batch_norm=True,
padding=False,
batch_norm_scale=True,
independent_bn_bias=True,
momentum=0.01,
laplace_padding=None,
input_regularizer="LaplaceL2norm",
@@ -300,8 +280,6 @@ def __init__(
:param batch_norm: bool specifying whether to include batch norm after convolution in core
:param padding: whether to pad convolutions. Defaults to False.
:param batch_norm_scale: bool, if True, a scaling factor after BN will be learned.
:param independent_bn_bias: If False, will allow for scaling the batch norm, so that batchnorm
and bias can both be true. Defaults to True.
:param momentum: momentum for batch norm
:param laplace_padding: padding size for the laplace convolution. If padding = None, it defaults to half of
the kernel size (recommended). Setting Padding to 0 is not recommended and leads to artefacts,
@@ -329,7 +307,6 @@ def __init__(
self.bias = bias
self.batch_norm = batch_norm
self.batch_norm_scale = batch_norm_scale
self.independent_bn_bias = independent_bn_bias
self.momentum = momentum
self.stride = stride
self.spatial_dilation = spatial_dilation
@@ -450,16 +427,3 @@ def get_kernels(self):
(temporal_kernel,) + spatial_kernel
for temporal_kernel, spatial_kernel in zip(self.temporal_hidden_kernel, self.spatial_hidden_kernel)
]

def add_bn_layer(self, layer, hidden_channels):
if self.batch_norm:
if self.independent_bn_bias:
layer["norm"] = nn.BatchNorm3d(hidden_channels, momentum=self.momentum)
else:
layer["norm"] = nn.BatchNorm3d(
hidden_channels, momentum=self.momentum, affine=self.bias and self.batch_norm_scale
)
if self.bias and not self.batch_norm_scale:
layer["bias"] = Bias3DLayer(hidden_channels)
elif self.batch_norm_scale:
layer["scale"] = Scale3DLayer(hidden_channels)
79 changes: 72 additions & 7 deletions neuralpredictors/training/early_stopping.py
@@ -42,8 +42,8 @@ def early_stopping(
tracker=None,
scheduler=None,
lr_decay_steps=1,
number_warmup_epochs=0,
):

"""
Early stopping iterator. Keeps track of the best model state during training. Resets the model to its
best state when either the maximum number of epochs or the patience (number of epochs without improvement)
@@ -72,10 +72,30 @@
tracker (Tracker):
Tracker to be invoked for every epoch. `log_objective` is invoked with the current value of `objective`. Note that `finalize`
method is NOT invoked.
scheduler: scheduler object, which automatically decreases the LR by a specified amount.
The scheduler's `step` method is invoked, passing in the current value of `objective`
lr_decay_steps: Number of times the learning rate should be reduced before stopping the training.
scheduler: scheduler object or tuple of two scheduler objects, which automatically modifies the LR by a specified amount.
If a tuple of schedulers is provided, the first scheduler is assumed to be the warm-up scheduler. Its `step` method
is called while `epoch` is smaller than `number_warmup_epochs`; afterwards, the `step` method of the second scheduler
is called. The current value of `objective` is passed to `step` if the scheduler at hand is `ReduceLROnPlateau`.
For example, a tuple of schedulers can be of the form:
scheduler = (warmup_scheduler, CosineAnnealingLR(*args, **kwargs))
or, in case no scheduler is desired after the warm-up:
scheduler = (warmup_scheduler, None).
An example warm-up scheduler can be defined as:
def warmup_function(current_step: int):
return 1 / (2 ** (float(number_warmup_epochs - current_step - 1)))
warmup_scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=warmup_function)
Of course, a single scheduler can also be provided.
If the warm-up ramp is off (ends at a too-high learning rate or does not reach the desired learning rate),
consider adjusting the warm-up function accordingly.
lr_decay_steps: Number of times the learning rate should be reduced before stopping the training.
number_warmup_epochs: Number of warm-up epochs
"""
training_status = model.training

@@ -107,11 +127,41 @@ def finalize(model, best_state_dict):
best_objective = current_objective = _objective()
best_state_dict = copy_state(model)

# check if the learning rate scheduler is 'ReduceLROnPlateau' so that we pass the current_objective to step
reduce_lr_on_plateau = False
if isinstance(scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau):
reduce_lr_on_plateau = True
elif isinstance(scheduler, tuple):
if isinstance(scheduler[1], torch.optim.lr_scheduler.ReduceLROnPlateau):
reduce_lr_on_plateau = True

# check if warm up is to be performed
if isinstance(scheduler, tuple):
warmup = True

# check if the warm-up scheduler is not of type None
if scheduler[0] is None:
logger.warning(
f"Provided warm up scheduler is of type None. Warm up epochs set to {number_warmup_epochs}. Setting number of warm up epochs to 0"
)
number_warmup_epochs = 0
else:
warmup = False

# check if warm up scheduler and number of warm-up epochs is provided
if warmup and number_warmup_epochs == 0:
logger.warning("Warm up scheduler is provided, but number of warm up steps is set to 0")

# inform user that no warm-up scheduler is provided although warm-up epochs is non-zero
elif not warmup and number_warmup_epochs > 0:
logger.warning(
f"Number of warm up steps is set to {number_warmup_epochs}, but no warm up scheduler is provided"
)

for repeat in range(lr_decay_steps):
patience_counter = 0

while patience_counter < patience and epoch < max_iter:

for _ in range(interval):
epoch += 1
if tracker is not None:
@@ -124,9 +174,24 @@

current_objective = _objective()

# if a scheduler is defined, a .step with the current objective is all that is needed to reduce the LR
# if a scheduler is defined, a .step with or without the current objective is all that is needed to reduce the LR
if scheduler is not None:
scheduler.step(current_objective)
if warmup and epoch < number_warmup_epochs:
# warm-up step
scheduler[0].step()
elif reduce_lr_on_plateau:
# reduce_lr_on_plateau requires current objective for the step
if not warmup:
scheduler.step(current_objective)
else:
scheduler[1].step(current_objective)
else:
# .step() for the rest of the schedulers
if not warmup:
scheduler.step()
else:
if scheduler[1] is not None:
scheduler[1].step()

if current_objective * maximize < best_objective * maximize - tolerance:
logger.info(f"[{epoch:03d}|{patience_counter:02d}/{patience:02d}] ---> {current_objective}")
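A hypothetical end-to-end call using the new warm-up support is sketched below. The dummy model, the constant stop_closure, and the assumption that the generator yields (epoch, objective) are placeholders to keep the sketch self-contained; the positional model and objective arguments follow the docstring above, and only the scheduler tuple, number_warmup_epochs, and lr_decay_steps come from this diff:

import torch

from neuralpredictors.training.early_stopping import early_stopping

# dummy stand-ins; replace with a real model, validation closure, and training step
model = torch.nn.Linear(10, 1)
stop_closure = lambda *args: 0.0  # callable returning the validation objective

number_warmup_epochs = 5
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)


def warmup_function(current_step: int):
    # doubles the LR each warm-up epoch until the base LR is reached
    return 1 / (2 ** (float(number_warmup_epochs - current_step - 1)))


warmup_scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=warmup_function)
main_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="max")

for epoch, val_objective in early_stopping(
    model,
    stop_closure,
    max_iter=10,
    scheduler=(warmup_scheduler, main_scheduler),   # (warm-up scheduler, main scheduler)
    number_warmup_epochs=number_warmup_epochs,
    lr_decay_steps=2,
):
    pass  # one training epoch would run here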
