From 029c6d8745f07189634b54acb2cc9db76a137ac3 Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Mon, 5 Aug 2024 09:51:41 +0000 Subject: [PATCH 1/7] adding one-hot encoding to embedding_layer --- mambular/arch_utils/embedding_layer.py | 34 ++++++++++++++++++++------ 1 file changed, 26 insertions(+), 8 deletions(-) diff --git a/mambular/arch_utils/embedding_layer.py b/mambular/arch_utils/embedding_layer.py index cd115f5..43fe453 100644 --- a/mambular/arch_utils/embedding_layer.py +++ b/mambular/arch_utils/embedding_layer.py @@ -12,6 +12,7 @@ def __init__( layer_norm_after_embedding=False, use_cls=False, cls_position=0, + cat_encoding="int", ): """ Embedding layer that handles numerical and categorical embeddings. @@ -56,15 +57,23 @@ def __init__( ] ) - self.cat_embeddings = nn.ModuleList( - [ - nn.Sequential( - nn.Embedding(num_categories + 1, d_model), - self.embedding_activation, + self.cat_embeddings = nn.ModuleList() + for feature_name, num_categories in cat_feature_info.items(): + if cat_encoding == "int": + self.cat_embeddings.append( + nn.Sequential( + nn.Embedding(num_categories + 1, d_model), + self.embedding_activation, + ) + ) + elif cat_encoding == "one-hot": + self.cat_embeddings.append( + nn.Sequential( + OneHotEncoding(num_categories), + nn.Linear(num_categories, d_model, bias=False), + self.embedding_activation, + ) ) - for feature_name, num_categories in cat_feature_info.items() - ] - ) if self.use_cls: self.cls_token = nn.Parameter(torch.zeros(1, 1, d_model)) @@ -143,3 +152,12 @@ def forward(self, num_features=None, cat_features=None): ) return x + + +class OneHotEncoding(nn.Module): + def __init__(self, num_categories): + super(OneHotEncoding, self).__init__() + self.num_categories = num_categories + + def forward(self, x): + return torch.nn.functional.one_hot(x, num_classes=self.num_categories).float() From d413fd84b0544d0a535d1302c6f96bd4ac902de1 Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Mon, 5 Aug 2024 09:51:59 +0000 Subject: [PATCH 2/7] adding option to one-hot encode cat features in embedding layer --- mambular/base_models/ft_transformer.py | 5 ++++- mambular/base_models/mambular.py | 9 ++++++--- mambular/base_models/tabtransformer.py | 5 ++++- 3 files changed, 14 insertions(+), 5 deletions(-) diff --git a/mambular/base_models/ft_transformer.py b/mambular/base_models/ft_transformer.py index da5e4ad..ddbf03c 100644 --- a/mambular/base_models/ft_transformer.py +++ b/mambular/base_models/ft_transformer.py @@ -132,9 +132,12 @@ def __init__( embedding_activation=self.hparams.get( "embedding_activation", config.embedding_activation ), - layer_norm_after_embedding=self.hparams.get("layer_norm_after_embedding"), + layer_norm_after_embedding=self.hparams.get( + "layer_norm_after_embedding", config.layer_norm_after_embedding + ), use_cls=True, cls_position=0, + cat_encoding=self.hparams.get("cat_encoding", config.cat_encoding), ) head_activation = self.hparams.get("head_activation", config.head_activation) diff --git a/mambular/base_models/mambular.py b/mambular/base_models/mambular.py index 33b2b6f..d362b8a 100644 --- a/mambular/base_models/mambular.py +++ b/mambular/base_models/mambular.py @@ -150,9 +150,12 @@ def __init__( embedding_activation=self.hparams.get( "embedding_activation", config.embedding_activation ), - layer_norm_after_embedding=self.hparams.get("layer_norm_after_embedding"), - use_cls=True, - cls_position=0, + layer_norm_after_embedding=self.hparams.get( + "layer_norm_after_embedding", config.layer_norm_after_embedding + ), + use_cls=False, + cls_position=-1, + 
cat_encoding=self.hparams.get("cat_encoding", config.cat_encoding), ) head_activation = self.hparams.get("head_activation", config.head_activation) diff --git a/mambular/base_models/tabtransformer.py b/mambular/base_models/tabtransformer.py index 630b968..d9c5052 100644 --- a/mambular/base_models/tabtransformer.py +++ b/mambular/base_models/tabtransformer.py @@ -139,9 +139,12 @@ def __init__( embedding_activation=self.hparams.get( "embedding_activation", config.embedding_activation ), - layer_norm_after_embedding=self.hparams.get("layer_norm_after_embedding"), + layer_norm_after_embedding=self.hparams.get( + "layer_norm_after_embedding", config.layer_norm_after_embedding + ), use_cls=True, cls_position=0, + cat_encoding=self.hparams.get("cat_encoding", config.cat_encoding), ) head_activation = self.hparams.get("head_activation", config.head_activation) From 07164f5e5cbb78f8cfa2669013e2df9b7774b403 Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Mon, 5 Aug 2024 09:52:10 +0000 Subject: [PATCH 3/7] adjusting configs --- mambular/configs/fttransformer_config.py | 1 + mambular/configs/mambular_config.py | 1 + mambular/configs/tabtransformer_config.py | 1 + 3 files changed, 3 insertions(+) diff --git a/mambular/configs/fttransformer_config.py b/mambular/configs/fttransformer_config.py index 11cc30c..35c3033 100644 --- a/mambular/configs/fttransformer_config.py +++ b/mambular/configs/fttransformer_config.py @@ -85,3 +85,4 @@ class DefaultFTTransformerConfig: layer_norm_eps: float = 1e-05 transformer_dim_feedforward: int = 256 numerical_embedding: str = "ple" + cat_encoding: str = "int" diff --git a/mambular/configs/mambular_config.py b/mambular/configs/mambular_config.py index c6fcd89..2ee5fe1 100644 --- a/mambular/configs/mambular_config.py +++ b/mambular/configs/mambular_config.py @@ -116,3 +116,4 @@ class DefaultMambularConfig: layer_norm_eps: float = 1e-05 AD_weight_decay: bool = False BC_layer_norm: bool = True + cat_encoding: str = "int" diff --git a/mambular/configs/tabtransformer_config.py b/mambular/configs/tabtransformer_config.py index 31e63f4..f0206d6 100644 --- a/mambular/configs/tabtransformer_config.py +++ b/mambular/configs/tabtransformer_config.py @@ -84,3 +84,4 @@ class DefaultTabTransformerConfig: transformer_activation: callable = ReGLU() layer_norm_eps: float = 1e-05 transformer_dim_feedforward: int = 512 + cat_encoding: str = "int" From 71cc68efedc642a5d260ba34162e0af37a9f47d7 Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Mon, 5 Aug 2024 09:52:24 +0000 Subject: [PATCH 4/7] renaming sklearn class attributes --- mambular/models/sklearn_base_classifier.py | 28 +++++++++++----------- mambular/models/sklearn_base_lss.py | 26 ++++++++++---------- mambular/models/sklearn_base_regressor.py | 24 ++++++++++--------- 3 files changed, 40 insertions(+), 38 deletions(-) diff --git a/mambular/models/sklearn_base_classifier.py b/mambular/models/sklearn_base_classifier.py index ec39edc..f442688 100644 --- a/mambular/models/sklearn_base_classifier.py +++ b/mambular/models/sklearn_base_classifier.py @@ -37,7 +37,7 @@ def __init__(self, model, config, **kwargs): } self.preprocessor = Preprocessor(**preprocessor_kwargs) - self.model = None + self.task_model = None # Raise a warning if task is set to 'classification' if preprocessor_kwargs.get("task") == "regression": @@ -194,7 +194,7 @@ def build_model( num_classes = len(np.unique(y)) - self.model = TaskModel( + self.task_model = TaskModel( model_class=self.base_model, num_classes=num_classes, config=self.config, @@ -237,10 +237,10 @@ def 
get_number_of_params(self, requires_grad=True): else: if requires_grad: return sum( - p.numel() for p in self.model.parameters() if p.requires_grad + p.numel() for p in self.task_model.parameters() if p.requires_grad ) else: - return sum(p.numel() for p in self.model.parameters()) + return sum(p.numel() for p in self.task_model.parameters()) def fit( self, @@ -345,7 +345,7 @@ def fit( num_classes = len(np.unique(y)) - self.model = TaskModel( + self.task_model = TaskModel( model_class=self.base_model, num_classes=num_classes, config=self.config, @@ -379,12 +379,12 @@ def fit( ], **trainer_kwargs ) - self.trainer.fit(self.model, self.data_module) + self.trainer.fit(self.task_model, self.data_module) best_model_path = checkpoint_callback.best_model_path if best_model_path: checkpoint = torch.load(best_model_path) - self.model.load_state_dict(checkpoint["state_dict"]) + self.task_model.load_state_dict(checkpoint["state_dict"]) return self @@ -404,14 +404,14 @@ def predict(self, X): The predicted target values. """ # Ensure model and data module are initialized - if self.model is None or self.data_module is None: + if self.task_model is None or self.data_module is None: raise ValueError("The model or data module has not been fitted yet.") # Preprocess the data using the data module cat_tensors, num_tensors = self.data_module.preprocess_test_data(X) # Move tensors to appropriate device - device = next(self.model.parameters()).device + device = next(self.task_model.parameters()).device if isinstance(cat_tensors, list): cat_tensors = [tensor.to(device) for tensor in cat_tensors] else: @@ -423,11 +423,11 @@ def predict(self, X): num_tensors = num_tensors.to(device) # Set model to evaluation mode - self.model.eval() + self.task_model.eval() # Perform inference with torch.no_grad(): - logits = self.model(num_features=num_tensors, cat_features=cat_tensors) + logits = self.task_model(num_features=num_tensors, cat_features=cat_tensors) # Check the shape of the logits to determine binary or multi-class classification if logits.shape[1] == 1: @@ -484,7 +484,7 @@ def predict_proba(self, X): # Preprocess the data if not isinstance(X, pd.DataFrame): X = pd.DataFrame(X) - device = next(self.model.parameters()).device + device = next(self.task_model.parameters()).device cat_tensors, num_tensors = self.data_module.preprocess_test_data(X) if isinstance(cat_tensors, list): cat_tensors = [tensor.to(device) for tensor in cat_tensors] @@ -497,11 +497,11 @@ def predict_proba(self, X): num_tensors = num_tensors.to(device) # Set the model to evaluation mode - self.model.eval() + self.task_model.eval() # Perform inference with torch.no_grad(): - logits = self.model(num_features=num_tensors, cat_features=cat_tensors) + logits = self.task_model(num_features=num_tensors, cat_features=cat_tensors) if logits.shape[1] > 1: probabilities = torch.softmax(logits, dim=1) else: diff --git a/mambular/models/sklearn_base_lss.py b/mambular/models/sklearn_base_lss.py index 62f2d3a..4e0d6e4 100644 --- a/mambular/models/sklearn_base_lss.py +++ b/mambular/models/sklearn_base_lss.py @@ -58,7 +58,7 @@ def __init__(self, model, config, **kwargs): } self.preprocessor = Preprocessor(**preprocessor_kwargs) - self.model = None + self.task_model = None # Raise a warning if task is set to 'classification' if preprocessor_kwargs.get("task") == "classification": @@ -212,7 +212,7 @@ def build_model( num_classes = len(np.unique(y)) - self.model = TaskModel( + self.task_model = TaskModel( model_class=self.base_model, num_classes=num_classes, 
config=self.config, @@ -255,10 +255,10 @@ def get_number_of_params(self, requires_grad=True): else: if requires_grad: return sum( - p.numel() for p in self.model.parameters() if p.requires_grad + p.numel() for p in self.task_model.parameters() if p.requires_grad ) else: - return sum(p.numel() for p in self.model.parameters()) + return sum(p.numel() for p in self.task_model.parameters()) def fit( self, @@ -383,7 +383,7 @@ def fit( X, y, X_val, y_val, val_size=val_size, random_state=random_state ) - self.model = TaskModel( + self.task_model = TaskModel( model_class=self.base_model, num_classes=self.family.param_count, family=self.family, @@ -419,12 +419,12 @@ def fit( ], **trainer_kwargs ) - self.trainer.fit(self.model, self.data_module) + self.trainer.fit(self.task_model, self.data_module) best_model_path = checkpoint_callback.best_model_path if best_model_path: checkpoint = torch.load(best_model_path) - self.model.load_state_dict(checkpoint["state_dict"]) + self.task_model.load_state_dict(checkpoint["state_dict"]) return self @@ -444,14 +444,14 @@ def predict(self, X, raw=False): The predicted target values. """ # Ensure model and data module are initialized - if self.model is None or self.data_module is None: + if self.task_model is None or self.data_module is None: raise ValueError("The model or data module has not been fitted yet.") # Preprocess the data using the data module cat_tensors, num_tensors = self.data_module.preprocess_test_data(X) # Move tensors to appropriate device - device = next(self.model.parameters()).device + device = next(self.task_model.parameters()).device if isinstance(cat_tensors, list): cat_tensors = [tensor.to(device) for tensor in cat_tensors] else: @@ -463,14 +463,14 @@ def predict(self, X, raw=False): num_tensors = num_tensors.to(device) # Set model to evaluation mode - self.model.eval() + self.task_model.eval() # Perform inference with torch.no_grad(): - predictions = self.model(num_features=num_tensors, cat_features=cat_tensors) + predictions = self.task_model(num_features=num_tensors, cat_features=cat_tensors) if not raw: - return self.model.family(predictions).cpu().numpy() + return self.task_model.family(predictions).cpu().numpy() # Convert predictions to NumPy array and return else: @@ -506,7 +506,7 @@ def evaluate(self, X, y_true, metrics=None, distribution_family=None): """ # Infer distribution family from model settings if not provided if distribution_family is None: - distribution_family = getattr(self.model, "distribution_family", "normal") + distribution_family = getattr(self.task_model, "distribution_family", "normal") # Setup default metrics if none are provided if metrics is None: diff --git a/mambular/models/sklearn_base_regressor.py b/mambular/models/sklearn_base_regressor.py index 30bedb9..1a098ac 100644 --- a/mambular/models/sklearn_base_regressor.py +++ b/mambular/models/sklearn_base_regressor.py @@ -37,7 +37,7 @@ def __init__(self, model, config, **kwargs): self.preprocessor = Preprocessor(**preprocessor_kwargs) self.base_model = model - self.model = None + self.task_model = None self.built = False # Raise a warning if task is set to 'classification' @@ -190,7 +190,7 @@ def build_model( X, y, X_val, y_val, val_size=val_size, random_state=random_state ) - self.model = TaskModel( + self.task_model = TaskModel( model_class=self.base_model, config=self.config, cat_feature_info=self.data_module.cat_feature_info, @@ -232,10 +232,10 @@ def get_number_of_params(self, requires_grad=True): else: if requires_grad: return sum( - p.numel() for p in 
self.model.parameters() if p.requires_grad + p.numel() for p in self.task_model.parameters() if p.requires_grad ) else: - return sum(p.numel() for p in self.model.parameters()) + return sum(p.numel() for p in self.task_model.parameters()) def fit( self, @@ -336,7 +336,7 @@ def fit( X, y, X_val, y_val, val_size=val_size, random_state=random_state ) - self.model = TaskModel( + self.task_model = TaskModel( model_class=self.base_model, config=self.config, cat_feature_info=self.data_module.cat_feature_info, @@ -372,12 +372,12 @@ def fit( ], **trainer_kwargs ) - self.trainer.fit(self.model, self.data_module) + self.trainer.fit(self.task_model, self.data_module) best_model_path = checkpoint_callback.best_model_path if best_model_path: checkpoint = torch.load(best_model_path) - self.model.load_state_dict(checkpoint["state_dict"]) + self.task_model.load_state_dict(checkpoint["state_dict"]) return self @@ -397,14 +397,14 @@ def predict(self, X): The predicted target values. """ # Ensure model and data module are initialized - if self.model is None or self.data_module is None: + if self.task_model is None or self.data_module is None: raise ValueError("The model or data module has not been fitted yet.") # Preprocess the data using the data module cat_tensors, num_tensors = self.data_module.preprocess_test_data(X) # Move tensors to appropriate device - device = next(self.model.parameters()).device + device = next(self.task_model.parameters()).device if isinstance(cat_tensors, list): cat_tensors = [tensor.to(device) for tensor in cat_tensors] else: @@ -416,11 +416,13 @@ def predict(self, X): num_tensors = num_tensors.to(device) # Set model to evaluation mode - self.model.eval() + self.task_model.eval() # Perform inference with torch.no_grad(): - predictions = self.model(num_features=num_tensors, cat_features=cat_tensors) + predictions = self.task_model( + num_features=num_tensors, cat_features=cat_tensors + ) # Convert predictions to NumPy array and return return predictions.cpu().numpy() From 53b77c5a4fa8256ace4ec52e7d660a86d4561991 Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Mon, 5 Aug 2024 09:52:37 +0000 Subject: [PATCH 5/7] adjusting class attribute in lightning wrapper --- mambular/base_models/lightning_wrapper.py | 42 ++++------------------- 1 file changed, 6 insertions(+), 36 deletions(-) diff --git a/mambular/base_models/lightning_wrapper.py b/mambular/base_models/lightning_wrapper.py index b26c643..6d3f5c3 100644 --- a/mambular/base_models/lightning_wrapper.py +++ b/mambular/base_models/lightning_wrapper.py @@ -82,7 +82,7 @@ def __init__( else: output_dim = num_classes - self.model = model_class( + self.base_model = model_class( config=config, num_feature_info=num_feature_info, cat_feature_info=cat_feature_info, @@ -107,7 +107,7 @@ def forward(self, num_features, cat_features): Model output. 
""" - return self.model.forward(num_features, cat_features) + return self.base_model.forward(num_features, cat_features) def compute_loss(self, predictions, y_true): """ @@ -168,16 +168,6 @@ def training_step(self, batch, batch_idx): prog_bar=True, logger=True, ) - elif isinstance(self.loss_fct, nn.MSELoss): - rmse = torch.sqrt(loss) - self.log( - "train_rmse", - rmse, - on_step=True, - on_epoch=True, - prog_bar=True, - logger=True, - ) return loss @@ -205,7 +195,7 @@ def validation_step(self, batch, batch_idx): self.log( "val_loss", val_loss, - on_step=True, + on_step=False, on_epoch=True, prog_bar=True, logger=True, @@ -218,17 +208,7 @@ def validation_step(self, batch, batch_idx): self.log( "val_acc", acc, - on_step=True, - on_epoch=True, - prog_bar=True, - logger=True, - ) - elif isinstance(self.loss_fct, nn.MSELoss): - rmse = torch.sqrt(val_loss) - self.log( - "val_rmse", - rmse, - on_step=True, + on_step=False, on_epoch=True, prog_bar=True, logger=True, @@ -272,17 +252,7 @@ def test_step(self, batch, batch_idx): self.log( "test_acc", acc, - on_step=True, - on_epoch=True, - prog_bar=True, - logger=True, - ) - elif isinstance(self.loss_fct, nn.MSELoss): - rmse = torch.sqrt(test_loss) - self.log( - "test_rmse", - rmse, - on_step=True, + on_step=False, on_epoch=True, prog_bar=True, logger=True, @@ -300,7 +270,7 @@ def configure_optimizers(self): A dictionary containing the optimizer and lr_scheduler configurations. """ optimizer = torch.optim.Adam( - self.model.parameters(), + self.base_model.parameters(), lr=self.lr, weight_decay=self.weight_decay, ) From 4f6b0882bdbfc6b564d316e5723c386f8acc88aa Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Mon, 5 Aug 2024 10:09:36 +0000 Subject: [PATCH 6/7] adjusting config docstrings --- mambular/configs/fttransformer_config.py | 3 ++- mambular/configs/mambular_config.py | 6 ++++-- mambular/configs/tabtransformer_config.py | 2 ++ 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/mambular/configs/fttransformer_config.py b/mambular/configs/fttransformer_config.py index 35c3033..a433753 100644 --- a/mambular/configs/fttransformer_config.py +++ b/mambular/configs/fttransformer_config.py @@ -58,6 +58,8 @@ class DefaultFTTransformerConfig: Epsilon value for layer normalization. transformer_dim_feedforward : int, default=512 Dimensionality of the feed-forward layers in the transformer. + cat_encoding : str, default="int" + whether to use integer encoding or one-hot encoding for cat features. """ lr: float = 1e-04 @@ -84,5 +86,4 @@ class DefaultFTTransformerConfig: transformer_activation: callable = ReGLU() layer_norm_eps: float = 1e-05 transformer_dim_feedforward: int = 256 - numerical_embedding: str = "ple" cat_encoding: str = "int" diff --git a/mambular/configs/mambular_config.py b/mambular/configs/mambular_config.py index 2ee5fe1..4f3e495 100644 --- a/mambular/configs/mambular_config.py +++ b/mambular/configs/mambular_config.py @@ -76,9 +76,11 @@ class DefaultMambularConfig: layer_norm_eps : float, default=1e-05 Epsilon value for layer normalization. AD_weight_decay : bool, default=False - whether weight decay is also applied to A-D matrices + whether weight decay is also applied to A-D matrices. BC_layer_norm: bool, default=True - whether to apply layer normalization to B-C matrices + whether to apply layer normalization to B-C matrices. + cat_encoding : str, default="int" + whether to use integer encoding or one-hot encoding for cat features. 
""" lr: float = 1e-04 diff --git a/mambular/configs/tabtransformer_config.py b/mambular/configs/tabtransformer_config.py index f0206d6..a1131c9 100644 --- a/mambular/configs/tabtransformer_config.py +++ b/mambular/configs/tabtransformer_config.py @@ -58,6 +58,8 @@ class DefaultTabTransformerConfig: Epsilon value for layer normalization. transformer_dim_feedforward : int, default=512 Dimensionality of the feed-forward layers in the transformer. + cat_encoding : str, default="int" + whether to use integer encoding or one-hot encoding for cat features. """ lr: float = 1e-04 From 71f35e658119cc5ce1d141e1d2ae7c7d0f9e6024 Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Mon, 5 Aug 2024 10:09:48 +0000 Subject: [PATCH 7/7] adjusting docstrings for documentation --- mambular/models/fttransformer.py | 6 ++++++ mambular/models/mambular.py | 24 ++++++++++++++++++++++++ mambular/models/tabtransformer.py | 6 ++++++ 3 files changed, 36 insertions(+) diff --git a/mambular/models/fttransformer.py b/mambular/models/fttransformer.py index 71d3653..efd346e 100644 --- a/mambular/models/fttransformer.py +++ b/mambular/models/fttransformer.py @@ -64,6 +64,8 @@ class FTTransformerRegressor(SklearnBaseRegressor): Epsilon value for layer normalization. transformer_dim_feedforward : int, default=512 Dimensionality of the feed-forward layers in the transformer. + cat_encoding : str, default="int" + whether to use integer encoding or one-hot encoding for cat features. n_bins : int, default=50 The number of bins to use for numerical feature binning. This parameter is relevant only if `numerical_preprocessing` is set to 'binning' or 'one_hot'. @@ -171,6 +173,8 @@ class FTTransformerClassifier(SklearnBaseClassifier): Epsilon value for layer normalization. transformer_dim_feedforward : int, default=512 Dimensionality of the feed-forward layers in the transformer. + cat_encoding : str, default="int" + whether to use integer encoding or one-hot encoding for cat features. n_bins : int, default=50 The number of bins to use for numerical feature binning. This parameter is relevant only if `numerical_preprocessing` is set to 'binning' or 'one_hot'. @@ -278,6 +282,8 @@ class FTTransformerLSS(SklearnBaseLSS): Epsilon value for layer normalization. transformer_dim_feedforward : int, default=512 Dimensionality of the feed-forward layers in the transformer. + cat_encoding : str, default="int" + whether to use integer encoding or one-hot encoding for cat features. n_bins : int, default=50 The number of bins to use for numerical feature binning. This parameter is relevant only if `numerical_preprocessing` is set to 'binning' or 'one_hot'. diff --git a/mambular/models/mambular.py b/mambular/models/mambular.py index 6fc147c..09a518a 100644 --- a/mambular/models/mambular.py +++ b/mambular/models/mambular.py @@ -79,6 +79,14 @@ class MambularRegressor(SklearnBaseRegressor): Whether to append a cls to the end of each 'sequence'. shuffle_embeddings : bool, default=False. Whether to shuffle the embeddings before being passed to the Mamba layers. + layer_norm_eps : float, default=1e-05 + Epsilon value for layer normalization. + AD_weight_decay : bool, default=False + whether weight decay is also applied to A-D matrices. + BC_layer_norm: bool, default=True + whether to apply layer normalization to B-C matrices. + cat_encoding : str, default="int" + whether to use integer encoding or one-hot encoding for cat features. n_bins : int, default=50 The number of bins to use for numerical feature binning. 
This parameter is relevant only if `numerical_preprocessing` is set to 'binning' or 'one_hot'. @@ -198,6 +206,14 @@ class MambularClassifier(SklearnBaseClassifier): Whether to use learnable feature interactions before passing through mamba blocks. shuffle_embeddings : bool, default=False. Whether to shuffle the embeddings before being passed to the Mamba layers. + layer_norm_eps : float, default=1e-05 + Epsilon value for layer normalization. + AD_weight_decay : bool, default=False + whether weight decay is also applied to A-D matrices. + BC_layer_norm: bool, default=True + whether to apply layer normalization to B-C matrices. + cat_encoding : str, default="int" + whether to use integer encoding or one-hot encoding for cat features. n_bins : int, default=50 The number of bins to use for numerical feature binning. This parameter is relevant only if `numerical_preprocessing` is set to 'binning' or 'one_hot'. @@ -320,6 +336,14 @@ class MambularLSS(SklearnBaseLSS): only if `numerical_preprocessing` is set to 'binning' or 'one_hot'. shuffle_embeddings : bool, default=False. Whether to shuffle the embeddings before being passed to the Mamba layers. + layer_norm_eps : float, default=1e-05 + Epsilon value for layer normalization. + AD_weight_decay : bool, default=False + whether weight decay is also applied to A-D matrices. + BC_layer_norm: bool, default=True + whether to apply layer normalization to B-C matrices. + cat_encoding : str, default="int" + whether to use integer encoding or one-hot encoding for cat features. numerical_preprocessing : str, default="ple" The preprocessing strategy for numerical features. Valid options are 'binning', 'one_hot', 'standardization', and 'normalization'. diff --git a/mambular/models/tabtransformer.py b/mambular/models/tabtransformer.py index 5cd3787..901369e 100644 --- a/mambular/models/tabtransformer.py +++ b/mambular/models/tabtransformer.py @@ -63,6 +63,8 @@ class TabTransformerRegressor(SklearnBaseRegressor): Epsilon value for layer normalization. transformer_dim_feedforward : int, default=512 Dimensionality of the feed-forward layers in the transformer. + cat_encoding : str, default="int" + whether to use integer encoding or one-hot encoding for cat features. n_bins : int, default=50 The number of bins to use for numerical feature binning. This parameter is relevant only if `numerical_preprocessing` is set to 'binning' or 'one_hot'. @@ -170,6 +172,8 @@ class TabTransformerClassifier(SklearnBaseClassifier): Epsilon value for layer normalization. transformer_dim_feedforward : int, default=512 Dimensionality of the feed-forward layers in the transformer. + cat_encoding : str, default="int" + whether to use integer encoding or one-hot encoding for cat features. n_bins : int, default=50 The number of bins to use for numerical feature binning. This parameter is relevant only if `numerical_preprocessing` is set to 'binning' or 'one_hot'. @@ -277,6 +281,8 @@ class TabTransformerLSS(SklearnBaseLSS): Epsilon value for layer normalization. transformer_dim_feedforward : int, default=512 Dimensionality of the feed-forward layers in the transformer. + cat_encoding : str, default="int" + whether to use integer encoding or one-hot encoding for cat features. n_bins : int, default=50 The number of bins to use for numerical feature binning. This parameter is relevant only if `numerical_preprocessing` is set to 'binning' or 'one_hot'.
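
Note (illustrative, not part of the patch series): the sketch below shows, in isolation, the one-hot categorical embedding path that PATCH 1/7 adds to EmbeddingLayer — category codes are one-hot encoded by the new OneHotEncoding module and projected to d_model with a bias-free nn.Linear, mirroring the nn.Sequential built when cat_encoding="one-hot" (the default remains "int" per the config changes in PATCH 3/7). The concrete values here (num_categories = 7, d_model = 16, nn.ReLU standing in for embedding_activation, the random batch of codes) are assumptions for illustration only. The sketch also assumes integer codes in [0, num_categories); the integer nn.Embedding path in the same patch reserves num_categories + 1 indices, so adjust the encoding size if an extra index is used for unknown categories.

# Minimal sketch of the cat_encoding="one-hot" path from PATCH 1/7 (hypothetical values).
import torch
import torch.nn as nn


class OneHotEncoding(nn.Module):
    """One-hot encodes integer category codes, as added in PATCH 1/7."""

    def __init__(self, num_categories):
        super(OneHotEncoding, self).__init__()
        self.num_categories = num_categories

    def forward(self, x):
        # Assumes codes lie in [0, num_categories); returns a float one-hot matrix.
        return torch.nn.functional.one_hot(x, num_classes=self.num_categories).float()


num_categories, d_model = 7, 16  # assumed example sizes
one_hot_embedding = nn.Sequential(
    OneHotEncoding(num_categories),
    nn.Linear(num_categories, d_model, bias=False),  # dense projection of the one-hot vector
    nn.ReLU(),  # stand-in for the layer's embedding_activation
)

codes = torch.randint(0, num_categories, (32,))  # a batch of 32 category codes
embedded = one_hot_embedding(codes)
print(embedded.shape)  # torch.Size([32, 16])

With the sklearn wrappers, this path would be selected by passing cat_encoding="one-hot" through the model hparams or config rather than by building the module directly, as PATCH 2/7 wires the option into the FT-Transformer, Mambular, and TabTransformer base models.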