From a032ae0d8fb98b002859bb98f301e637a4a39d4d Mon Sep 17 00:00:00 2001 From: "viktor.dobrev" Date: Thu, 11 May 2023 17:57:24 +0200 Subject: [PATCH 01/16] support for other pytorch schedulers and warm up --- neuralpredictors/training/early_stopping.py | 70 ++++++++++++++++++--- 1 file changed, 63 insertions(+), 7 deletions(-) diff --git a/neuralpredictors/training/early_stopping.py b/neuralpredictors/training/early_stopping.py index 2ddb4dd7..f78a9cd3 100644 --- a/neuralpredictors/training/early_stopping.py +++ b/neuralpredictors/training/early_stopping.py @@ -42,8 +42,8 @@ def early_stopping( tracker=None, scheduler=None, lr_decay_steps=1, + number_warmup_epochs=0, ): - """ Early stopping iterator. Keeps track of the best model state during training. Resets the model to its best state, when either the number of maximum epochs or the patience [number of epochs without improvement) @@ -72,10 +72,29 @@ def early_stopping( tracker (Tracker): Tracker to be invoked for every epoch. `log_objective` is invoked with the current value of `objective`. Note that `finalize` method is NOT invoked. - scheduler: scheduler object, which automatically reduces decreases the LR by a specified amount. - The scheduler's `step` method is invoked, passing in the current value of `objective` - lr_decay_steps: Number of times the learning rate should be reduced before stopping the training. + scheduler: scheduler object or tuple of two scheduler objects, which automatically modifies the LR by a specified amount. + The scheduler's `step` method is invoked for a the approptiate scheduler if a tuple of two schedulers is provided. + The current value of `objective` is passed to the `step` method if the scheduler at hand is `ReduceLROnPlateau`. + For example a provided tuple of scheduler can be of the form: + + scheduler = (warmup_scheduler,CosineAnnealingLR(*args,**kwargs)) + + or in case that no scheduler is desired after the warm up: + + scheduler = (warmup_scheduler,None). + + An example warm up scheduler can be defined as: + def warmup_function(current_step: int): + return 1 / (2 ** (float(number_warmup_epochs - current_step - 1))) + + warmup_scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=warmup_function) + + Of course single schedulers can also be provided. + If the warm-up is shifted (goes to a to high learning rate or does not reach the desired learning rate), + consider adjusting the warm up function accordingly. + lr_decay_steps: Number of times the learning rate should be reduced before stopping the training. + number_warmup_epochs: Number of warm-up epochs """ training_status = model.training @@ -107,11 +126,36 @@ def finalize(model, best_state_dict): best_objective = current_objective = _objective() best_state_dict = copy_state(model) + # check if the learning rate scheduler is 'ReduceLROnPlateau' so that we pass the current_objective to step + reduce_lr_on_plateau = False + if isinstance(scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau): + reduce_lr_on_plateau = True + elif isinstance(scheduler, tuple): + if isinstance(scheduler[1], torch.optim.lr_scheduler.ReduceLROnPlateau): + reduce_lr_on_plateau = True + + # check if warm up is to be performed + if isinstance(scheduler, tuple): + warmup = True + if scheduler[0] is None: + logger.warning( + f"Provided warm up scheduler is of type None. Warm up epochs set to {number_warmup_epochs}. 
Setting number of warm up epochs to 0"
+            )
+            number_warmup_epochs = 0
+    else:
+        warmup = False
+
+    if warmup and number_warmup_epochs == 0:
+        logger.warning("Warm up scheduler is provided, but number of warm up steps is set to 0")
+    elif not warmup and number_warmup_epochs > 0:
+        logger.warning(
+            f"Number of warm up steps is set to {number_warmup_epochs}, but no warm up scheduler is provided"
+        )
+
     for repeat in range(lr_decay_steps):
         patience_counter = 0
         while patience_counter < patience and epoch < max_iter:
-
             for _ in range(interval):
                 epoch += 1
                 if tracker is not None:
@@ -124,9 +168,21 @@ def finalize(model, best_state_dict):

             current_objective = _objective()

-            # if a scheduler is defined, a .step with the current objective is all that is needed to reduce the LR
+            # if a scheduler is defined, a .step with or without the current objective is all that is needed to reduce the LR
             if scheduler is not None:
-                scheduler.step(current_objective)
+                if warmup and epoch < number_warmup_epochs:
+                    scheduler[0].step()
+                elif reduce_lr_on_plateau:
+                    if not warmup:
+                        scheduler.step(current_objective)
+                    else:
+                        scheduler[1].step(current_objective)
+                else:
+                    if not warmup:
+                        scheduler.step()
+                    else:
+                        if scheduler[1] is not None:
+                            scheduler[1].step()

             if current_objective * maximize < best_objective * maximize - tolerance:
                 logger.info(f"[{epoch:03d}|{patience_counter:02d}/{patience:02d}] ---> {current_objective}")

From c0a89a44aeefd438bb791528546d5c63571344e5 Mon Sep 17 00:00:00 2001
From: "viktor.dobrev"
Date: Mon, 22 May 2023 14:06:31 +0200
Subject: [PATCH 02/16] better in-line comments for schedulers

---
 neuralpredictors/training/early_stopping.py | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/neuralpredictors/training/early_stopping.py b/neuralpredictors/training/early_stopping.py
index f78a9cd3..c101c44f 100644
--- a/neuralpredictors/training/early_stopping.py
+++ b/neuralpredictors/training/early_stopping.py
@@ -73,9 +73,10 @@ def early_stopping(
             Tracker to be invoked for every epoch. `log_objective` is invoked with the current value of
             `objective`. Note that `finalize` method is NOT invoked.
         scheduler: scheduler object or tuple of two scheduler objects, which automatically modifies the LR by a specified amount.
-            The scheduler's `step` method is invoked for a the approptiate scheduler if a tuple of two schedulers is provided.
-            The current value of `objective` is passed to the `step` method if the scheduler at hand is `ReduceLROnPlateau`.
-            For example a provided tuple of scheduler can be of the form:
+            If a tuple of schedulers is provided, the 1st scheduler is assumed to be the warm up scheduler. The .step method
+            for the 1st scheduler will be called while epoch is smaller than number_warmup_epochs; afterwards, the .step method of
+            the second scheduler is called. The current value of `objective` is passed to the `step` method if the scheduler at hand is `ReduceLROnPlateau`.
+            For example, a provided tuple of schedulers can be of the form:
 
             scheduler = (warmup_scheduler,CosineAnnealingLR(*args,**kwargs))
@@ -137,6 +138,8 @@ def finalize(model, best_state_dict):
     # check if warm up is to be performed
     if isinstance(scheduler, tuple):
         warmup = True
+
+        # check if the warm-up scheduler is not of type None
         if scheduler[0] is None:
            logger.warning(
                f"Provided warm up scheduler is of type None. Warm up epochs set to {number_warmup_epochs}. 
Setting number of warm up epochs to 0" @@ -145,8 +148,11 @@ def finalize(model, best_state_dict): else: warmup = False + # check if warm up scheduler and number of warm-up epochs is provided if warmup and number_warmup_epochs == 0: logger.warning("Warm up scheduler is provided, but number of warm up steps is set to 0") + + # inform user that no warm-up scheduler is provided althouth warm-up epochs is non zero elif not warmup and number_warmup_epochs > 0: logger.warning( f"Number of warm up steps is set to {number_warmup_epochs}, but no warm up scheduler is provided" @@ -171,13 +177,16 @@ def finalize(model, best_state_dict): # if a scheduler is defined, a .step with or without the current objective is all that is needed to reduce the LR if scheduler is not None: if warmup and epoch < number_warmup_epochs: + # warm-up step scheduler[0].step() elif reduce_lr_on_plateau: + # reduce_lr_on_plateau requires current objective for the step if not warmup: scheduler.step(current_objective) else: scheduler[1].step(current_objective) else: + # .step() for the rest of the schedulers if not warmup: scheduler.step() else: From 3a3554619c54f4693d30370f2f0eef19e7706475 Mon Sep 17 00:00:00 2001 From: Max Burg Date: Thu, 7 Mar 2024 13:39:18 +0100 Subject: [PATCH 03/16] remove obsolete independent_bn_bias argument --- neuralpredictors/layers/cores/conv2d.py | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/neuralpredictors/layers/cores/conv2d.py b/neuralpredictors/layers/cores/conv2d.py index d7fd2e7d..956f8692 100644 --- a/neuralpredictors/layers/cores/conv2d.py +++ b/neuralpredictors/layers/cores/conv2d.py @@ -57,7 +57,6 @@ def __init__( momentum=0.1, pad_input=True, hidden_padding=None, - independent_bn_bias=True, batch_norm: Union[bool, list[bool]] = True, batch_norm_scale: Union[bool, list[bool]] = True, final_batchnorm_scale: bool = True, @@ -93,8 +92,6 @@ def __init__( batch_norm: Boolean, if True appends a BN layer after each convolutional layer batch_norm_scale: If True, learns BN including the scaling factor final_batchnorm_scale: Deprecated. If batch_norm_scale passed as an Iterable, this will be ignored. - independent_bn_bias: Deprecated. If False, will allow for scaling the batch norm, so that batchnorm - and bias can both be true. Defaults to True. hidden_dilation: If set to > 1, will apply dilated convs for all hidden layers laplace_padding: Padding size for the laplace convolution. If padding = None, it defaults to half of the kernel size (recommended). Setting Padding to 0 is not recommended and leads to artefacts, @@ -137,13 +134,6 @@ def __init__( self.bias = bias if isinstance(bias, Iterable) else [bias] * layers - self.independent_bn_bias = independent_bn_bias - if self.independent_bn_bias and not all(self.bias) and not all(self.batch_norm_scale): - warnings.warn( - """The default of `independent_bn_bias=True` will ignore the kwargs `bias`, `batch_norm_scale`. 
- If you want to use these arguments, please set `independent_bn_bias=False`.""" - ) - super().__init__() regularizer_config = ( dict(padding=laplace_padding, kernel=input_kern) @@ -219,11 +209,6 @@ def set_batchnorm_type(self): def add_bn_layer(self, layer: OrderedDict, layer_idx: int): if self.batch_norm[layer_idx]: hidden_channels = self.hidden_channels[layer_idx] - - if self.independent_bn_bias: - layer["norm"] = self.batchnorm_layer_cls(hidden_channels, momentum=self.momentum) - return - bias = self.bias[layer_idx] scale = self.batch_norm_scale[layer_idx] From 4bf5b54f54e5b827c516932c618dcfdc14a66333 Mon Sep 17 00:00:00 2001 From: Max Burg Date: Thu, 7 Mar 2024 14:32:59 +0100 Subject: [PATCH 04/16] Fix a Bug in SE2dCore when skip > 1 --- neuralpredictors/layers/cores/conv2d.py | 44 +++++++++++-------------- 1 file changed, 20 insertions(+), 24 deletions(-) diff --git a/neuralpredictors/layers/cores/conv2d.py b/neuralpredictors/layers/cores/conv2d.py index 956f8692..2e764d97 100644 --- a/neuralpredictors/layers/cores/conv2d.py +++ b/neuralpredictors/layers/cores/conv2d.py @@ -245,25 +245,27 @@ def add_first_layer(self): self.add_activation(layer) self.features.add_module("layer0", nn.Sequential(layer)) + def add_subsequent_conv_layer(self, layer: OrderedDict, l: int) -> None: + layer[self.conv_layer_name] = self.ConvLayer( + in_channels=self.hidden_channels[l - 1] + if not self.skip > 1 + else min(self.skip, l) * self.hidden_channels[0], + out_channels=self.hidden_channels[l], + kernel_size=self.hidden_kern[l - 1], + stride=self.stride, + padding=self.hidden_padding or ((self.hidden_kern[l - 1] - 1) * self.hidden_dilation + 1) // 2, + dilation=self.hidden_dilation, + bias=self.bias, + ) + def add_subsequent_layers(self): if not isinstance(self.hidden_kern, Iterable): self.hidden_kern = [self.hidden_kern] * (self.num_layers - 1) for l in range(1, self.num_layers): layer = OrderedDict() - if self.hidden_padding is None: - self.hidden_padding = ((self.hidden_kern[l - 1] - 1) * self.hidden_dilation + 1) // 2 - layer[self.conv_layer_name] = self.ConvLayer( - in_channels=self.hidden_channels[l - 1] - if not self.skip > 1 - else min(self.skip, l) * self.hidden_channels[0], - out_channels=self.hidden_channels[l], - kernel_size=self.hidden_kern[l - 1], - stride=self.stride, - padding=self.hidden_padding, - dilation=self.hidden_dilation, - bias=self.bias, - ) + + self.add_subsequent_conv_layer(layer, l) self.add_bn_layer(layer, l) self.add_activation(layer) self.features.add_module("layer{}".format(l), nn.Sequential(layer)) @@ -345,6 +347,9 @@ def __init__( self.init_std = init_std super().__init__(*args, **kwargs, input_regularizer=input_regularizer) + if self.skip > 0: + raise NotImplementedError("Skip connections are not implemented for RotationEquivariant2dCore") + def set_batchnorm_type(self): if not self.rot_eq_batch_norm: self.batchnorm_layer_cls = nn.BatchNorm2d @@ -588,17 +593,8 @@ def add_subsequent_layers(self): for l in range(1, self.num_layers): layer = OrderedDict() - if self.hidden_padding is None: - self.hidden_padding = ((self.hidden_kern[l - 1] - 1) * self.hidden_dilation + 1) // 2 - layer[self.conv_layer_name] = self.ConvLayer( - in_channels=self.hidden_channels if not self.skip > 1 else min(self.skip, l) * self.hidden_channels, - out_channels=self.hidden_channels, - kernel_size=self.hidden_kern[l - 1], - stride=self.stride, - padding=self.hidden_padding, - dilation=self.hidden_dilation, - bias=self.bias, - ) + + self.add_subsequent_conv_layer(layer, l) 
self.add_bn_layer(layer, l) self.add_activation(layer) if (self.num_layers - l) <= self.n_se_blocks: From ceaba0db20236525ae75c9f91525d2b7e2b6297f Mon Sep 17 00:00:00 2001 From: Max Burg Date: Thu, 7 Mar 2024 16:03:17 +0100 Subject: [PATCH 05/16] move add_bn_layer to Core(ABC) --- neuralpredictors/layers/cores/base.py | 35 +++++++++++++++++++++++++ neuralpredictors/layers/cores/conv2d.py | 19 -------------- neuralpredictors/layers/cores/conv3d.py | 18 ++++--------- 3 files changed, 40 insertions(+), 32 deletions(-) diff --git a/neuralpredictors/layers/cores/base.py b/neuralpredictors/layers/cores/base.py index a6936204..1122030e 100644 --- a/neuralpredictors/layers/cores/base.py +++ b/neuralpredictors/layers/cores/base.py @@ -1,4 +1,5 @@ from abc import ABC, abstractmethod +from collections import OrderedDict from torch import nn @@ -8,6 +9,10 @@ class Core(ABC): Base class for the core models, taking 2d inputs and computing nonlinear features. """ + def __init__(self) -> None: + super().__init__() + self.set_batchnorm_type() + def initialize(self): """ Initialization applied on the core. @@ -29,6 +34,36 @@ def init_conv(m): if m.bias is not None: m.bias.data.fill_(0) + @abstractmethod + def set_batchnorm_type(self): + """ + Set batchnorm_layer_cls, bias_layer_cls, scale_layer_cls class attributes + """ + self.batchnorm_layer_cls = None + self.bias_layer_cls = None + self.scale_layer_cls = None + + def add_bn_layer(self, layer: OrderedDict, layer_idx: int): + for attr in ["batch_norm", "hidden_channels", "independent_bn_bias", "momentum"]: + if not hasattr(self, attr): + raise NotImplementedError(f"Subclasses must have a `{attr}` attribute.") + + if self.batch_norm[layer_idx]: + hidden_channels = self.hidden_channels[layer_idx] + + if self.independent_bn_bias: + layer["norm"] = self.batchnorm_layer_cls(hidden_channels, momentum=self.momentum) + return + + bias = self.bias[layer_idx] + scale = self.batch_norm_scale[layer_idx] + + layer["norm"] = self.batchnorm_layer_cls(hidden_channels, momentum=self.momentum, affine=bias and scale) + if bias and not scale: + layer["bias"] = self.bias_layer_cls(hidden_channels) + elif not bias and scale: + layer["scale"] = self.scale_layer_cls(hidden_channels) + @abstractmethod def regularizer(self): """ diff --git a/neuralpredictors/layers/cores/conv2d.py b/neuralpredictors/layers/cores/conv2d.py index d7fd2e7d..456856bd 100644 --- a/neuralpredictors/layers/cores/conv2d.py +++ b/neuralpredictors/layers/cores/conv2d.py @@ -204,7 +204,6 @@ def __init__( warnings.warn( "group sparsity can not be calculated for the requested conv type. Hidden channels will not be regularized and gamma_hidden is ignored." 
) - self.set_batchnorm_type() self.features = nn.Sequential() self.add_first_layer() self.add_subsequent_layers() @@ -215,24 +214,6 @@ def set_batchnorm_type(self): self.bias_layer_cls = Bias2DLayer self.scale_layer_cls = Scale2DLayer - # def add_bn_layer(self, layer, hidden_channels): - def add_bn_layer(self, layer: OrderedDict, layer_idx: int): - if self.batch_norm[layer_idx]: - hidden_channels = self.hidden_channels[layer_idx] - - if self.independent_bn_bias: - layer["norm"] = self.batchnorm_layer_cls(hidden_channels, momentum=self.momentum) - return - - bias = self.bias[layer_idx] - scale = self.batch_norm_scale[layer_idx] - - layer["norm"] = self.batchnorm_layer_cls(hidden_channels, momentum=self.momentum, affine=bias and scale) - if bias and not scale: - layer["bias"] = self.bias_layer_cls(hidden_channels) - elif not bias and scale: - layer["scale"] = self.scale_layer_cls(hidden_channels) - def penultimate_layer_built(self): """Returns True if the penultimate layer has been built.""" return len(self.features) == self.num_layers - 1 diff --git a/neuralpredictors/layers/cores/conv3d.py b/neuralpredictors/layers/cores/conv3d.py index 6950f981..38cf99a4 100644 --- a/neuralpredictors/layers/cores/conv3d.py +++ b/neuralpredictors/layers/cores/conv3d.py @@ -428,6 +428,11 @@ def __init__( self.features.add_module("layer{}".format(l + 1), nn.Sequential(layer)) self.initialize(cuda=cuda) + def set_batchnorm_type(self): + self.batchnorm_layer_cls = nn.BatchNorm3d + self.bias_layer_cls = Bias3DLayer + self.scale_layer_cls = Scale3DLayer + def forward(self, x): for features in self.features: x = features(x) @@ -450,16 +455,3 @@ def get_kernels(self): (temporal_kernel,) + spatial_kernel for temporal_kernel, spatial_kernel in zip(self.temporal_hidden_kernel, self.spatial_hidden_kernel) ] - - def add_bn_layer(self, layer, hidden_channels): - if self.batch_norm: - if self.independent_bn_bias: - layer["norm"] = nn.BatchNorm3d(hidden_channels, momentum=self.momentum) - else: - layer["norm"] = nn.BatchNorm3d( - hidden_channels, momentum=self.momentum, affine=self.bias and self.batch_norm_scale - ) - if self.bias and not self.batch_norm_scale: - layer["bias"] = Bias3DLayer(hidden_channels) - elif self.batch_norm_scale: - layer["scale"] = Scale3DLayer(hidden_channels) From e77ccacb4997080842b5a941cef00868ad504485 Mon Sep 17 00:00:00 2001 From: Max Burg Date: Thu, 7 Mar 2024 16:19:56 +0100 Subject: [PATCH 06/16] add ConvCore for convolutional models only --- neuralpredictors/layers/cores/base.py | 56 +++++++++++++------------ neuralpredictors/layers/cores/conv2d.py | 4 +- neuralpredictors/layers/cores/conv3d.py | 4 +- 3 files changed, 33 insertions(+), 31 deletions(-) diff --git a/neuralpredictors/layers/cores/base.py b/neuralpredictors/layers/cores/base.py index 1122030e..2bee95dc 100644 --- a/neuralpredictors/layers/cores/base.py +++ b/neuralpredictors/layers/cores/base.py @@ -9,10 +9,6 @@ class Core(ABC): Base class for the core models, taking 2d inputs and computing nonlinear features. """ - def __init__(self) -> None: - super().__init__() - self.set_batchnorm_type() - def initialize(self): """ Initialization applied on the core. @@ -34,6 +30,35 @@ def init_conv(m): if m.bias is not None: m.bias.data.fill_(0) + @abstractmethod + def regularizer(self): + """ + Regularization applied on the core. Returns a scalar value. + """ + + @abstractmethod + def forward(self, x): + """ + Forward function for pytorch nn module. 
+ + Args: + x (torch.tensor): input of shape (batch, channels, height, width) + """ + + def __repr__(self): + s = super().__repr__() + s += f" [{self.__class__.__name__} regularizers: " + ret = [] + for attr in filter(lambda x: "gamma" in x or "skip" in x, dir(self)): + ret.append(f"{attr} = {getattr(self, attr)}") + return s + "|".join(ret) + "]\n" + + +class ConvCore(Core): + def __init__(self) -> None: + super().__init__() + self.set_batchnorm_type() + @abstractmethod def set_batchnorm_type(self): """ @@ -63,26 +88,3 @@ def add_bn_layer(self, layer: OrderedDict, layer_idx: int): layer["bias"] = self.bias_layer_cls(hidden_channels) elif not bias and scale: layer["scale"] = self.scale_layer_cls(hidden_channels) - - @abstractmethod - def regularizer(self): - """ - Regularization applied on the core. Returns a scalar value. - """ - - @abstractmethod - def forward(self, x): - """ - Forward function for pytorch nn module. - - Args: - x (torch.tensor): input of shape (batch, channels, height, width) - """ - - def __repr__(self): - s = super().__repr__() - s += f" [{self.__class__.__name__} regularizers: " - ret = [] - for attr in filter(lambda x: "gamma" in x or "skip" in x, dir(self)): - ret.append(f"{attr} = {getattr(self, attr)}") - return s + "|".join(ret) + "]\n" diff --git a/neuralpredictors/layers/cores/conv2d.py b/neuralpredictors/layers/cores/conv2d.py index 456856bd..1f97df3c 100644 --- a/neuralpredictors/layers/cores/conv2d.py +++ b/neuralpredictors/layers/cores/conv2d.py @@ -27,12 +27,12 @@ RotationEquivariantScale2DLayer, ) from ..squeeze_excitation import SqueezeExcitationBlock -from .base import Core +from .base import ConvCore, Core logger = logging.getLogger(__name__) -class Stacked2dCore(Core, nn.Module): +class Stacked2dCore(ConvCore, nn.Module): """ An instantiation of the Core base class. Made up of layers layers of nn.sequential modules. Allows for the flexible implementations of many different architectures, such as convolutional layers, diff --git a/neuralpredictors/layers/cores/conv3d.py b/neuralpredictors/layers/cores/conv3d.py index 38cf99a4..c85f46c4 100644 --- a/neuralpredictors/layers/cores/conv3d.py +++ b/neuralpredictors/layers/cores/conv3d.py @@ -11,10 +11,10 @@ from ...regularizers import DepthLaplaceL21d from ..affine import Bias3DLayer, Scale3DLayer -from .base import Core +from .base import ConvCore -class Core3d(Core): +class Core3d(ConvCore): def initialize(self, cuda=False): self.apply(self.init_conv) self.put_to_cuda(cuda=cuda) From e83133e4890f9bdbb246f0afda4c70d8d51ed1ca Mon Sep 17 00:00:00 2001 From: Max Burg Date: Thu, 7 Mar 2024 16:26:55 +0100 Subject: [PATCH 07/16] [add] documentation string --- neuralpredictors/layers/cores/base.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/neuralpredictors/layers/cores/base.py b/neuralpredictors/layers/cores/base.py index 2bee95dc..6c84c209 100644 --- a/neuralpredictors/layers/cores/base.py +++ b/neuralpredictors/layers/cores/base.py @@ -56,6 +56,9 @@ def __repr__(self): class ConvCore(Core): def __init__(self) -> None: + """ + Derived classes need to define "batch_norm", "hidden_channels", "independent_bn_bias", "momentum" attributes. 
+ """ super().__init__() self.set_batchnorm_type() From f45769bf7973373336086fffee3fa09c188374bf Mon Sep 17 00:00:00 2001 From: Max Burg Date: Thu, 7 Mar 2024 17:31:37 +0100 Subject: [PATCH 08/16] add type checking for abstract class attributes batch_norm and hidden_channels --- neuralpredictors/layers/cores/base.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/neuralpredictors/layers/cores/base.py b/neuralpredictors/layers/cores/base.py index 6c84c209..4c2747b0 100644 --- a/neuralpredictors/layers/cores/base.py +++ b/neuralpredictors/layers/cores/base.py @@ -75,6 +75,9 @@ def add_bn_layer(self, layer: OrderedDict, layer_idx: int): for attr in ["batch_norm", "hidden_channels", "independent_bn_bias", "momentum"]: if not hasattr(self, attr): raise NotImplementedError(f"Subclasses must have a `{attr}` attribute.") + for attr in ["batch_norm", "hidden_channels"]: + if not isinstance(getattr(self, attr), list): + raise ValueError(f"`{attr}` must be a list.") if self.batch_norm[layer_idx]: hidden_channels = self.hidden_channels[layer_idx] From 282ab8c11606c974eed70fcc40ae67b5979281a0 Mon Sep 17 00:00:00 2001 From: Max Burg Date: Thu, 7 Mar 2024 17:33:00 +0100 Subject: [PATCH 09/16] check for bias and batch_norm_scale arguments --- neuralpredictors/layers/cores/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neuralpredictors/layers/cores/base.py b/neuralpredictors/layers/cores/base.py index 4c2747b0..b161c8a3 100644 --- a/neuralpredictors/layers/cores/base.py +++ b/neuralpredictors/layers/cores/base.py @@ -72,7 +72,7 @@ def set_batchnorm_type(self): self.scale_layer_cls = None def add_bn_layer(self, layer: OrderedDict, layer_idx: int): - for attr in ["batch_norm", "hidden_channels", "independent_bn_bias", "momentum"]: + for attr in ["batch_norm", "hidden_channels", "independent_bn_bias", "momentum", "bias", "batch_norm_scale"]: if not hasattr(self, attr): raise NotImplementedError(f"Subclasses must have a `{attr}` attribute.") for attr in ["batch_norm", "hidden_channels"]: From bb0b005d7b2d223235011d1f60ee7cd9ec6b689c Mon Sep 17 00:00:00 2001 From: Max Burg Date: Thu, 7 Mar 2024 17:33:50 +0100 Subject: [PATCH 10/16] check types for bias and batch_norm_scale --- neuralpredictors/layers/cores/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neuralpredictors/layers/cores/base.py b/neuralpredictors/layers/cores/base.py index b161c8a3..e551e625 100644 --- a/neuralpredictors/layers/cores/base.py +++ b/neuralpredictors/layers/cores/base.py @@ -75,7 +75,7 @@ def add_bn_layer(self, layer: OrderedDict, layer_idx: int): for attr in ["batch_norm", "hidden_channels", "independent_bn_bias", "momentum", "bias", "batch_norm_scale"]: if not hasattr(self, attr): raise NotImplementedError(f"Subclasses must have a `{attr}` attribute.") - for attr in ["batch_norm", "hidden_channels"]: + for attr in ["batch_norm", "hidden_channels", "bias", "batch_norm_scale"]: if not isinstance(getattr(self, attr), list): raise ValueError(f"`{attr}` must be a list.") From 5120e7d3130d900e969d7f357ef302567fc78fe6 Mon Sep 17 00:00:00 2001 From: Polina Turishcheva Date: Fri, 8 Mar 2024 11:00:12 +0100 Subject: [PATCH 11/16] fix typing import for python 3.8 --- neuralpredictors/layers/cores/conv2d.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/neuralpredictors/layers/cores/conv2d.py b/neuralpredictors/layers/cores/conv2d.py index 1f97df3c..b567122f 100644 --- a/neuralpredictors/layers/cores/conv2d.py +++ b/neuralpredictors/layers/cores/conv2d.py @@ 
-1,7 +1,7 @@ import logging import warnings from collections import OrderedDict -from typing import Union +from typing import Union, List try: from collections import Iterable @@ -53,13 +53,13 @@ def __init__( input_stride=1, final_nonlinearity=True, elu_shift=(0, 0), - bias: Union[bool, list[bool]] = True, + bias: Union[bool, List[bool]] = True, momentum=0.1, pad_input=True, hidden_padding=None, independent_bn_bias=True, - batch_norm: Union[bool, list[bool]] = True, - batch_norm_scale: Union[bool, list[bool]] = True, + batch_norm: Union[bool, List[bool]] = True, + batch_norm_scale: Union[bool, List[bool]] = True, final_batchnorm_scale: bool = True, hidden_dilation=1, laplace_padding=0, From 0b55b1fad0415a85da0bba16a8ffc08331e965ef Mon Sep 17 00:00:00 2001 From: Polina Turishcheva Date: Fri, 8 Mar 2024 11:23:08 +0100 Subject: [PATCH 12/16] isort fixed --- neuralpredictors/layers/cores/conv2d.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neuralpredictors/layers/cores/conv2d.py b/neuralpredictors/layers/cores/conv2d.py index b567122f..3412e286 100644 --- a/neuralpredictors/layers/cores/conv2d.py +++ b/neuralpredictors/layers/cores/conv2d.py @@ -1,7 +1,7 @@ import logging import warnings from collections import OrderedDict -from typing import Union, List +from typing import List, Union try: from collections import Iterable From c4858e9088dd8e83ab1a942b79886ce7e0c57b39 Mon Sep 17 00:00:00 2001 From: Max Burg Date: Fri, 8 Mar 2024 11:38:53 +0100 Subject: [PATCH 13/16] merge --- neuralpredictors/layers/cores/conv2d.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/neuralpredictors/layers/cores/conv2d.py b/neuralpredictors/layers/cores/conv2d.py index 87c45a5c..b3ffcee7 100644 --- a/neuralpredictors/layers/cores/conv2d.py +++ b/neuralpredictors/layers/cores/conv2d.py @@ -57,14 +57,8 @@ def __init__( momentum=0.1, pad_input=True, hidden_padding=None, -<<<<<<< HEAD - batch_norm: Union[bool, list[bool]] = True, - batch_norm_scale: Union[bool, list[bool]] = True, -======= - independent_bn_bias=True, batch_norm: Union[bool, List[bool]] = True, batch_norm_scale: Union[bool, List[bool]] = True, ->>>>>>> upstream/main final_batchnorm_scale: bool = True, hidden_dilation=1, laplace_padding=0, @@ -115,11 +109,6 @@ def __init__( linear: Boolean, if True, removes all nonlinearities nonlinearity_type: String to set the used nonlinearity type loaded from neuralpredictors.layers.activation nonlinearity_config: Dict of the nonlinearities __init__ parameters. - To enable learning batch_norms bias and scale independently, the arguments bias, batch_norm and batch_norm_scale - work together: By default, all are true. In this case there won't be a bias learned in the convolutional layer, but - batch_norm will learn both its bias and scale. If batch_norm is false, but bias true, a bias will be learned in the - convolutional layer. If batch_norm and bias are true, but batch_norm_scale is false, batch_norm won't have learnable - parameters and a BiasLayer will be added after the batch_norm layer. 
""" if depth_separable and attention_conv: From 0015784f0f34a1d4ef4bcaa5f8b7a0685da225e9 Mon Sep 17 00:00:00 2001 From: Max Burg Date: Fri, 8 Mar 2024 11:44:47 +0100 Subject: [PATCH 14/16] remove independent bn bias --- neuralpredictors/layers/cores/base.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/neuralpredictors/layers/cores/base.py b/neuralpredictors/layers/cores/base.py index e551e625..001d2d3b 100644 --- a/neuralpredictors/layers/cores/base.py +++ b/neuralpredictors/layers/cores/base.py @@ -57,7 +57,7 @@ def __repr__(self): class ConvCore(Core): def __init__(self) -> None: """ - Derived classes need to define "batch_norm", "hidden_channels", "independent_bn_bias", "momentum" attributes. + Derived classes need to define "batch_norm", "hidden_channels", "momentum", "bias", "batch_norm_scale" attributes. """ super().__init__() self.set_batchnorm_type() @@ -72,7 +72,7 @@ def set_batchnorm_type(self): self.scale_layer_cls = None def add_bn_layer(self, layer: OrderedDict, layer_idx: int): - for attr in ["batch_norm", "hidden_channels", "independent_bn_bias", "momentum", "bias", "batch_norm_scale"]: + for attr in ["batch_norm", "hidden_channels", "momentum", "bias", "batch_norm_scale"]: if not hasattr(self, attr): raise NotImplementedError(f"Subclasses must have a `{attr}` attribute.") for attr in ["batch_norm", "hidden_channels", "bias", "batch_norm_scale"]: @@ -82,10 +82,6 @@ def add_bn_layer(self, layer: OrderedDict, layer_idx: int): if self.batch_norm[layer_idx]: hidden_channels = self.hidden_channels[layer_idx] - if self.independent_bn_bias: - layer["norm"] = self.batchnorm_layer_cls(hidden_channels, momentum=self.momentum) - return - bias = self.bias[layer_idx] scale = self.batch_norm_scale[layer_idx] From 6588ba6cfc986a06152842f4a89eb9dd2eb72d12 Mon Sep 17 00:00:00 2001 From: Max Burg Date: Fri, 8 Mar 2024 11:51:42 +0100 Subject: [PATCH 15/16] remove independent_bn_bias in conv3d, remove duplicate add_bn_layer function --- neuralpredictors/layers/cores/conv3d.py | 40 +++++-------------------- 1 file changed, 7 insertions(+), 33 deletions(-) diff --git a/neuralpredictors/layers/cores/conv3d.py b/neuralpredictors/layers/cores/conv3d.py index c85f46c4..3ed4ebad 100644 --- a/neuralpredictors/layers/cores/conv3d.py +++ b/neuralpredictors/layers/cores/conv3d.py @@ -30,8 +30,14 @@ def init_conv(m): if m.bias is not None: m.bias.data.fill_(0) + def set_batchnorm_type(self): + self.batchnorm_layer_cls = nn.BatchNorm3d + self.bias_layer_cls = Bias3DLayer + self.scale_layer_cls = Scale3DLayer + class Basic3dCore(Core3d, nn.Module): + def __init__( self, input_channels, @@ -54,13 +60,11 @@ def __init__( input_regularizer="LaplaceL2norm", cuda=False, final_nonlin=True, - independent_bn_bias=True, spatial_dilation: int = 1, temporal_dilation: int = 1, hidden_spatial_dilation=1, hidden_temporal_dilation=1, ): - """ :param input_channels: integer, number of input channels as in :param hidden_channels: number of hidden channels (i.e feature maps) in each hidden layer @@ -85,14 +89,6 @@ def __init__( zero is the default however to recreate backwards compatibility. :param input_regularizer: specifies what kind of spatial regularized is applied :param final_nonlin: bool specifiyng whether to include a nonlinearity after last convolutional layer in core - :param independent_bn_bias: If False, will allow for scaling the batch norm, so that batch norm - and bias can both be true. Defaults to True. 
- - To enable learning batch_norms bias and scale independently, the arguments bias, batch_norm and batch_norm_scale - work together: By default, all are true. In this case there won't be a bias learned in the convolutional layer, but - batch_norm will learn both its bias and scale. If batch_norm is false, but bias true, a bias will be learned in the - convolutional layer. If batch_norm and bias are true, but batch_norm_scale is false, batch_norm won't have learnable - parameters and a BiasLayer will be added after the batch_norm layer. """ super().__init__() @@ -112,7 +108,6 @@ def __init__( self.bias = bias self.batch_norm = batch_norm self.batch_norm_scale = batch_norm_scale - self.independent_bn_bias = independent_bn_bias self.momentum = momentum self.spatial_dilation = spatial_dilation self.temporal_dilation = temporal_dilation @@ -225,19 +220,6 @@ def laplace_temporal(self): def regularizer(self): return self.gamma_input_spatial * self.laplace_spatial(), self.gamma_input_temporal * self.laplace_temporal() - def add_bn_layer(self, layer, hidden_channels): - if self.batch_norm: - if self.independent_bn_bias: - layer["norm"] = nn.BatchNorm3d(hidden_channels, momentum=self.momentum) - else: - layer["norm"] = nn.BatchNorm3d( - hidden_channels, momentum=self.momentum, affine=self.bias and self.batch_norm_scale - ) - if self.bias and not self.batch_norm_scale: - layer["bias"] = Bias3DLayer(hidden_channels) - elif self.batch_norm_scale: - layer["scale"] = Scale3DLayer(hidden_channels) - @property def out_channels(self): return self.hidden_channels[-1] @@ -247,6 +229,7 @@ def get_kernels(self): class Factorized3dCore(Core3d, nn.Module): + def __init__( self, input_channels, @@ -267,7 +250,6 @@ def __init__( batch_norm=True, padding=False, batch_norm_scale=True, - independent_bn_bias=True, momentum=0.01, laplace_padding=None, input_regularizer="LaplaceL2norm", @@ -300,8 +282,6 @@ def __init__( :param batch_norm: bool specifying whether to include batch norm after convolution in core :param padding: whether to pad convolutions. Defaults to False. :param batch_norm_scale: bool, if True, a scaling factor after BN will be learned. - :param independent_bn_bias: If False, will allow for scaling the batch norm, so that batchnorm - and bias can both be true. Defaults to True. :param momentum: momentum for batch norm :param laplace_padding: padding size for the laplace convolution. If padding = None, it defaults to half of the kernel size (recommended). 
Setting Padding to 0 is not recommended and leads to artefacts, @@ -329,7 +309,6 @@ def __init__( self.bias = bias self.batch_norm = batch_norm self.batch_norm_scale = batch_norm_scale - self.independent_bn_bias = independent_bn_bias self.momentum = momentum self.stride = stride self.spatial_dilation = spatial_dilation @@ -428,11 +407,6 @@ def __init__( self.features.add_module("layer{}".format(l + 1), nn.Sequential(layer)) self.initialize(cuda=cuda) - def set_batchnorm_type(self): - self.batchnorm_layer_cls = nn.BatchNorm3d - self.bias_layer_cls = Bias3DLayer - self.scale_layer_cls = Scale3DLayer - def forward(self, x): for features in self.features: x = features(x) From e354212e227793ef3e2d465739854e0f154f01b5 Mon Sep 17 00:00:00 2001 From: Max Burg Date: Fri, 8 Mar 2024 12:04:49 +0100 Subject: [PATCH 16/16] older black version --- neuralpredictors/layers/cores/conv3d.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/neuralpredictors/layers/cores/conv3d.py b/neuralpredictors/layers/cores/conv3d.py index 3ed4ebad..cd39d263 100644 --- a/neuralpredictors/layers/cores/conv3d.py +++ b/neuralpredictors/layers/cores/conv3d.py @@ -37,7 +37,6 @@ def set_batchnorm_type(self): class Basic3dCore(Core3d, nn.Module): - def __init__( self, input_channels, @@ -229,7 +228,6 @@ def get_kernels(self): class Factorized3dCore(Core3d, nn.Module): - def __init__( self, input_channels,
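A minimal usage sketch for the warm-up and scheduler-tuple support added in PATCH 01/16 and 02/16. The model, optimizer, and the commented-out call are placeholders; only the `scheduler` tuple and `number_warmup_epochs` wiring follows the API introduced above, and the warm-up function is the one suggested in the new docstring.

    import torch

    model = torch.nn.Linear(10, 1)  # stand-in for any nn.Module
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    number_warmup_epochs = 5

    def warmup_function(current_step: int):
        # multiplier doubles each epoch (1/16, 1/8, 1/4, 1/2, 1 for five warm-up epochs),
        # so the base LR is reached in the last warm-up epoch
        return 1 / (2 ** (float(number_warmup_epochs - current_step - 1)))

    warmup_scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=warmup_function)
    plateau_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="max")

    # warm-up scheduler first, main scheduler second; (warmup_scheduler, None) disables the second stage
    scheduler = (warmup_scheduler, plateau_scheduler)

    # handed to the early-stopping iterator together with the matching epoch count, e.g.
    # early_stopping(model, objective, ..., scheduler=scheduler, number_warmup_epochs=number_warmup_epochs)

With (warmup_scheduler, None), only the warm-up steps are taken; passing a plain ReduceLROnPlateau object keeps the behaviour from before the patch.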
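A minimal sketch of the add_bn_layer contract that the refactored ConvCore (PATCH 05 through 15) expects from a subclass. It assumes the Bias2DLayer/Scale2DLayer helpers from neuralpredictors.layers.affine that Stacked2dCore already uses; the single conv layer is illustrative and not taken from the library.

    from collections import OrderedDict

    from torch import nn

    from neuralpredictors.layers.affine import Bias2DLayer, Scale2DLayer
    from neuralpredictors.layers.cores.base import ConvCore

    class TinyCore(ConvCore, nn.Module):
        def __init__(self, input_channels=1, hidden_channels=32, layers=1):
            # add_bn_layer checks that these exist and that the per-layer ones are lists
            self.batch_norm = [True] * layers
            self.batch_norm_scale = [True] * layers
            self.bias = [False] * layers
            self.hidden_channels = [hidden_channels] * layers
            self.momentum = 0.1
            super().__init__()  # ConvCore.__init__ calls set_batchnorm_type()

            layer = OrderedDict()
            layer["conv"] = nn.Conv2d(input_channels, hidden_channels, kernel_size=3, padding=1, bias=False)
            # bias=False and scale=True -> BatchNorm2d(affine=False) followed by a learned scale layer
            self.add_bn_layer(layer, layer_idx=0)
            self.features = nn.Sequential(layer)

        def set_batchnorm_type(self):
            self.batchnorm_layer_cls = nn.BatchNorm2d
            self.bias_layer_cls = Bias2DLayer
            self.scale_layer_cls = Scale2DLayer

        def regularizer(self):
            return 0

        def forward(self, x):
            return self.features(x)

Splitting bias and scale this way is what replaces the old independent_bn_bias switch: the batch norm layer itself stays affine-free unless both flags are set for that layer.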