diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 97189ad9..c9355abb 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -20,4 +20,4 @@ repos:
     hooks:
       - id: mdformat
         additional_dependencies:
-          - mdformat-gfm==0.3.6
\ No newline at end of file
+          - mdformat-gfm==0.3.6
diff --git a/configs/embeddings_model.yaml b/configs/embeddings_model.yaml
new file mode 100644
index 00000000..0e5308d9
--- /dev/null
+++ b/configs/embeddings_model.yaml
@@ -0,0 +1,45 @@
+loader:
+  name: CustomReIDLoader
+  params:
+    dataset_name: ParkingLot
+
+model:
+  name: reid_test
+  nodes:
+    - name: GhostFaceNetV2
+
+    - name: GhostFaceNetHead
+      alias: color-embeddings
+      metadata_task_override: color
+      params:
+        embedding_size: 16
+
+      losses:
+        - name: SupConLoss
+          params:
+            miner: MultiSimilarityMiner
+            distance: CosineSimilarity
+            reducer: ThresholdReducer
+            reducer_params:
+              high: 0.3
+            regularizer: LpRegularizer
+
+      metrics:
+        - name: ClosestIsPositiveAccuracy
+
+        - name: MedianDistances
+
+      visualizers:
+        - name: EmbeddingsVisualizer
+
+trainer:
+  preprocessing:
+    train_image_size: [256, 256]
+
+  batch_size: 16
+  epochs: 100
+  validation_interval: 10
+  n_log_images: 8
+
+  callbacks:
+    - name: ExportOnTrainEnd
diff --git a/luxonis_train/attached_modules/base_attached_module.py b/luxonis_train/attached_modules/base_attached_module.py
index a5f14761..7e177735 100644
--- a/luxonis_train/attached_modules/base_attached_module.py
+++ b/luxonis_train/attached_modules/base_attached_module.py
@@ -1,13 +1,13 @@
 import logging
 from abc import ABC
 from contextlib import suppress
-from typing import Generic
+from typing import Generic, get_args
 
 from luxonis_ml.utils.registry import AutoRegisterMeta
 from torch import Size, Tensor, nn
 from typing_extensions import TypeVarTuple, Unpack
 
-from luxonis_train.enums import TaskType
+from luxonis_train.enums import Task, TaskType
 from luxonis_train.nodes import BaseNode
 from luxonis_train.utils import IncompatibleException, Labels, Packet
 
@@ -57,19 +57,29 @@ class BaseAttachedModule(
         labels I{or} segmentation labels.
""" - supported_tasks: list[TaskType | tuple[TaskType, ...]] | None = None + supported_tasks: list[Task | tuple[Task, ...]] | None = None def __init__(self, *, node: BaseNode | None = None): super().__init__() self._node = node self._epoch = 0 - self.required_labels: list[TaskType] = [] - if self._node and self.supported_tasks: + self.required_labels: list[Task] = [] + if self._node is not None and self.supported_tasks: + for tasks in self.supported_tasks: + if not isinstance(tasks, tuple): + tasks = (tasks,) + for task in tasks: + if isinstance(task, TaskType): + continue + task.name = self.node.metadata_task_override.get( + task.name, task.name + ) + module_supported = [ label.value - if isinstance(label, TaskType) - else f"({' + '.join(label)})" + if isinstance(label, Task) + else f"({' + '.join(map(str, label))})" for label in self.supported_tasks ] module_supported = f"[{', '.join(module_supported)}]" @@ -81,7 +91,7 @@ def __init__(self, *, node: BaseNode | None = None): ) node_tasks = set(self.node.tasks) for required_labels in self.supported_tasks: - if isinstance(required_labels, TaskType): + if isinstance(required_labels, Task): required_labels = [required_labels] else: required_labels = list(required_labels) @@ -159,7 +169,7 @@ def class_names(self) -> list[str]: return self.node.class_names @property - def node_tasks(self) -> list[TaskType]: + def node_tasks(self) -> list[Task]: """Getter for the tasks of the attached node. @type: dict[TaskType, str] @@ -201,11 +211,11 @@ def get_label( @raises ValueError: If the module requires multiple labels and the C{task_type} is not provided. @raises IncompatibleException: If the label is not found in the labels dictionary. """ - return self._get_label(labels, task_type)[0] + return self._get_label(labels, task_type) def _get_label( - self, labels: Labels, task_type: TaskType | None = None - ) -> tuple[Tensor, TaskType]: + self, labels: Labels, task_type: Task | None = None + ) -> Tensor: if task_type is None: if len(self.required_labels) == 1: task_type = self.required_labels[0] @@ -221,7 +231,7 @@ def _get_label( f"Available labels: {list(labels.keys())}. " f"Missing label: '{task}'." ) - return labels[task], task_type + return labels[task] raise ValueError( f"{self.name} requires multiple labels. You must provide the " @@ -229,7 +239,7 @@ def _get_label( ) def get_input_tensors( - self, inputs: Packet[Tensor], task_type: TaskType | str | None = None + self, inputs: Packet[Tensor], task_type: Task | str | None = None ) -> list[Tensor]: """Extracts the input tensors from the packet. @@ -259,7 +269,7 @@ def get_input_tensors( For such cases, the C{prepare} method should be overridden. 
""" if task_type is not None: - if isinstance(task_type, TaskType): + if isinstance(task_type, Task): if task_type not in self.node_tasks: raise IncompatibleException( f"Task {task_type.value} is not supported by the node " @@ -345,24 +355,45 @@ def prepare( set(self.supported_tasks) & set(self.node_tasks) ) x = self.get_input_tensors(inputs) - if labels is None or len(labels) == 0: + if labels is None or not labels: return x, None # type: ignore - label, task_type = self._get_label(labels) - if task_type in [TaskType.CLASSIFICATION, TaskType.SEGMENTATION]: + + label = self._get_label(labels) + generics = self._get_generic_params() + if generics is None or generics[0].__name__ == "Unpack": + return x, label # type: ignore + + if len(generics) != 2: + raise RuntimeError( + f"The type signature of '{self.name}' implies a complicated " + f"custom module ({self.name}[{', '.join(g.__name__ for g in generics)}]). " + "Please implement your own `prepare` method. The default " + "`prepare` works only when the generic type of the module " + "is `[Tensor | list[Tensor], Tensor]`." + ) + + if generics[0] is Tensor: if len(x) == 1: x = x[0] else: logger.warning( - f"Module {self.name} expects a single tensor as input, " + f"Module '{self.name}' expects a single tensor as input, " f"but got {len(x)} tensors. Using the last tensor. " f"If this is not the desired behavior, please override the " "`prepare` method of the attached module or the `wrap` " - f"method of {self.node.name}." + f"method of '{self.node.name}'." ) x = x[-1] return x, label # type: ignore + def _get_generic_params(self) -> tuple[type, ...] | None: + cls = type(self) + try: + return get_args(cls.__orig_bases__[0]) # type: ignore + except Exception: + return None + def _check_node_type_override(self) -> None: if "node" not in self.__annotations__: return diff --git a/luxonis_train/attached_modules/losses/README.md b/luxonis_train/attached_modules/losses/README.md index ffe218d4..32d853dc 100644 --- a/luxonis_train/attached_modules/losses/README.md +++ b/luxonis_train/attached_modules/losses/README.md @@ -12,8 +12,9 @@ List of all the available loss functions. - [`AdaptiveDetectionLoss`](#adaptivedetectionloss) - [`EfficientKeypointBBoxLoss`](#efficientkeypointbboxloss) - [`FOMOLocalizationLoss`](#fomolocalizationLoss) -- \[`PrecisionDFLDetectionLoss`\] (# precisiondfldetectionloss) -- \[`PrecisionDFLSegmentationLoss`\] (# precisiondflsegmentationloss) +- [Embedding Losses](#embedding-losses) +- [`PrecisionDFLDetectionLoss`](#precisiondfldetectionloss) +- [`PrecisionDFLSegmentationLoss`](#precisiondflsegmentationloss) ## `CrossEntropyLoss` @@ -124,6 +125,48 @@ Adapted from [here](https://arxiv.org/abs/2108.07610). | --------------- | ------- | ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | `object_weight` | `float` | `500` | Weight for the objects in the loss calculation. Training with a larger `object_weight` in the loss parameters may result in more false positives (FP), but it will improve accuracy. 
 |
+## Embedding Losses
+
+We support the following losses taken from [pytorch-metric-learning](https://kevinmusgrave.github.io/pytorch-metric-learning/losses/):
+
+- [AngularLoss](https://kevinmusgrave.github.io/pytorch-metric-learning/losses/#angularloss)
+- [CircleLoss](https://kevinmusgrave.github.io/pytorch-metric-learning/losses/#circleloss)
+- [ContrastiveLoss](https://kevinmusgrave.github.io/pytorch-metric-learning/losses/#contrastiveloss)
+- [DynamicSoftMarginLoss](https://kevinmusgrave.github.io/pytorch-metric-learning/losses/#dynamicsoftmarginloss)
+- [FastAPLoss](https://kevinmusgrave.github.io/pytorch-metric-learning/losses/#fastaploss)
+- [HistogramLoss](https://kevinmusgrave.github.io/pytorch-metric-learning/losses/#histogramloss)
+- [InstanceLoss](https://kevinmusgrave.github.io/pytorch-metric-learning/losses/#instanceloss)
+- [IntraPairVarianceLoss](https://kevinmusgrave.github.io/pytorch-metric-learning/losses/#intrapairvarianceloss)
+- [GeneralizedLiftedStructureLoss](https://kevinmusgrave.github.io/pytorch-metric-learning/losses/#generalizedliftedstructureloss)
+- [LiftedStructureLoss](https://kevinmusgrave.github.io/pytorch-metric-learning/losses/#liftedstructureloss)
+- [MarginLoss](https://kevinmusgrave.github.io/pytorch-metric-learning/losses/#marginloss)
+- [MultiSimilarityLoss](https://kevinmusgrave.github.io/pytorch-metric-learning/losses/#multisimilarityloss)
+- [NPairsLoss](https://kevinmusgrave.github.io/pytorch-metric-learning/losses/#npairsloss)
+- [NCALoss](https://kevinmusgrave.github.io/pytorch-metric-learning/losses/#ncaloss)
+- [NTXentLoss](https://kevinmusgrave.github.io/pytorch-metric-learning/losses/#ntxentloss)
+- [PNPLoss](https://kevinmusgrave.github.io/pytorch-metric-learning/losses/#pnploss)
+- [RankedListLoss](https://kevinmusgrave.github.io/pytorch-metric-learning/losses/#rankedlistloss)
+- [SignalToNoiseRatioContrastiveLoss](https://kevinmusgrave.github.io/pytorch-metric-learning/losses/#signaltonoisecontrastiveloss)
+- [SupConLoss](https://kevinmusgrave.github.io/pytorch-metric-learning/losses/#supconloss)
+- [ThresholdConsistentMarginLoss](https://kevinmusgrave.github.io/pytorch-metric-learning/losses/#thresholdconsistentmarginloss)
+- [TripletMarginLoss](https://kevinmusgrave.github.io/pytorch-metric-learning/losses/#tripletmarginloss)
+- [TupletMarginLoss](https://kevinmusgrave.github.io/pytorch-metric-learning/losses/#tupletmarginloss)
+
+**Parameters:**
+
+For loss-specific parameters, see the documentation pages linked above. In addition to the loss-specific parameters, the following parameters are available:
+
+| Key                  | Type   | Default value | Description |
+| -------------------- | ------ | ------------- | ----------- |
+| `miner`              | `str`  | `None`        | Name of the miner to use with the loss. If `None`, no miner is used. All miners from [pytorch-metric-learning](https://kevinmusgrave.github.io/pytorch-metric-learning/miners/) are supported. |
+| `miner_params`       | `dict` | `None`        | Parameters for the miner. |
+| `distance`           | `str`  | `None`        | Name of the distance metric to use with the loss. If `None`, the loss's default distance is used. All distance metrics from [pytorch-metric-learning](https://kevinmusgrave.github.io/pytorch-metric-learning/distances/) are supported. |
+| `distance_params`    | `dict` | `None`        | Parameters for the distance metric. |
+| `reducer`            | `str`  | `None`        | Name of the reducer to use with the loss. If `None`, the loss's default reducer is used. All reducers from [pytorch-metric-learning](https://kevinmusgrave.github.io/pytorch-metric-learning/reducers/) are supported. |
+| `reducer_params`     | `dict` | `None`        | Parameters for the reducer. |
+| `regularizer`        | `str`  | `None`        | Name of the regularizer to use with the loss. If `None`, no regularizer is used. All regularizers from [pytorch-metric-learning](https://kevinmusgrave.github.io/pytorch-metric-learning/regularizers/) are supported. |
+| `regularizer_params` | `dict` | `None`        | Parameters for the regularizer. |
+
 ## `PrecisionDFLDetectionLoss`
 
 Adapted from [here](https://arxiv.org/pdf/2207.02696.pdf) and [here](https://arxiv.org/pdf/2209.02976.pdf).
diff --git a/luxonis_train/attached_modules/losses/__init__.py b/luxonis_train/attached_modules/losses/__init__.py
index 32b33174..6c4e882d 100644
--- a/luxonis_train/attached_modules/losses/__init__.py
+++ b/luxonis_train/attached_modules/losses/__init__.py
@@ -3,6 +3,7 @@
 from .bce_with_logits import BCEWithLogitsLoss
 from .cross_entropy import CrossEntropyLoss
 from .efficient_keypoint_bbox_loss import EfficientKeypointBBoxLoss
+from .embedding_losses import EmbeddingLossWrapper
 from .fomo_localization_loss import FOMOLocalizationLoss
 from .ohem_bce_with_logits import OHEMBCEWithLogitsLoss
 from .ohem_cross_entropy import OHEMCrossEntropyLoss
@@ -28,6 +29,7 @@
     "OHEMCrossEntropyLoss",
     "OHEMBCEWithLogitsLoss",
     "FOMOLocalizationLoss",
+    "EmbeddingLossWrapper",
     "PrecisionDFLDetectionLoss",
     "PrecisionDFLSegmentationLoss",
 ]
diff --git a/luxonis_train/attached_modules/losses/embedding_losses.py b/luxonis_train/attached_modules/losses/embedding_losses.py
new file mode 100644
index 00000000..b7b3518e
--- /dev/null
+++ b/luxonis_train/attached_modules/losses/embedding_losses.py
@@ -0,0 +1,139 @@
+import logging
+
+import pytorch_metric_learning.distances as pml_distances
+import pytorch_metric_learning.losses as pml_losses
+import pytorch_metric_learning.miners as pml_miners
+import pytorch_metric_learning.reducers as pml_reducers
+import pytorch_metric_learning.regularizers as pml_regularizers
+from pytorch_metric_learning.losses import CrossBatchMemory
+from torch import Tensor
+
+from luxonis_train.enums import Metadata
+from luxonis_train.nodes.base_node import BaseNode
+from luxonis_train.nodes.heads.ghostfacenet_head import GhostFaceNetHead
+from luxonis_train.utils.types import Kwargs
+
+from .base_loss import BaseLoss
+
+logger = logging.getLogger(__name__)
+
+EMBEDDING_LOSSES = [
+    "AngularLoss",
+    "CircleLoss",
+    "ContrastiveLoss",
+    "DynamicSoftMarginLoss",
+    "FastAPLoss",
+    "HistogramLoss",
+    "InstanceLoss",
+    "IntraPairVarianceLoss",
+    "GeneralizedLiftedStructureLoss",
+    "LiftedStructureLoss",
+    "MarginLoss",
+    "MultiSimilarityLoss",
+    "NPairsLoss",
+    "NCALoss",
+    "NTXentLoss",
+    "PNPLoss",
+    "RankedListLoss",
+    "SignalToNoiseRatioContrastiveLoss",
+    "SupConLoss",
+    "ThresholdConsistentMarginLoss",
+    "TripletMarginLoss",
+    "TupletMarginLoss",
+]
+
+for _loss_name in EMBEDDING_LOSSES:
+
+    class EmbeddingLossWrapper(
+        BaseLoss[Tensor, Tensor], register_name=_loss_name
+    ):
+        node: GhostFaceNetHead
+        supported_tasks = [Metadata("id")]
+        miner: pml_miners.BaseMiner | None
+
+        # Bound at class-creation time for each loop iteration; reading the
+        # loop variable `_loss_name` from inside a method would late-bind to
+        # the last name in `EMBEDDING_LOSSES` (ruff B023).
+        _pml_loss_name: str = _loss_name
+
+        def __init__(
+            self,
+            *,
+            miner: str | None = None,
+            miner_params: Kwargs | None = None,
+            distance: str | None = None,
+            distance_params: Kwargs | None = None,
+            reducer: str | None = None,
+            reducer_params: Kwargs | None = None,
+            regularizer: str | None = None,
+            regularizer_params: Kwargs | None = None,
+            node: BaseNode | None = None,
+            **kwargs,
+        ):
+            super().__init__(node=node)
+            loss_name = self._pml_loss_name
+
+            if not hasattr(pml_losses, loss_name):
+                raise ValueError(
+                    f"Loss {loss_name} not found in pytorch-metric-learning"
+                )
+            Loss = getattr(pml_losses, loss_name)
+
+            if reducer is not None:
+                if not hasattr(pml_reducers, reducer):
+                    raise ValueError(
+                        f"Reducer {reducer} not found in pytorch-metric-learning"
+                    )
+                Reducer = getattr(pml_reducers, reducer)
+                kwargs["reducer"] = Reducer(**(reducer_params or {}))
+            if regularizer is not None:
+                if not hasattr(pml_regularizers, regularizer):
+                    raise ValueError(
+                        f"Regularizer {regularizer} not found in pytorch-metric-learning"
+                    )
+                Regularizer = getattr(pml_regularizers, regularizer)
+                kwargs["embedding_regularizer"] = Regularizer(
+                    **(regularizer_params or {})
+                )
+            if distance is not None:
+                if not hasattr(pml_distances, distance):
+                    raise ValueError(
+                        f"Distance {distance} not found in pytorch-metric-learning"
+                    )
+                Distance = getattr(pml_distances, distance)
+                kwargs["distance"] = Distance(**(distance_params or {}))
+
+            if miner is not None:
+                if not hasattr(pml_miners, miner):
+                    raise ValueError(
+                        f"Miner {miner} not found in pytorch-metric-learning"
+                    )
+                Miner = getattr(pml_miners, miner)
+                self.miner = Miner(**(miner_params or {}))
+            else:
+                self.miner = None
+
+            self.loss = Loss(**kwargs)
+
+            if self.node.cross_batch_memory_size is not None:
+                if loss_name in CrossBatchMemory.supported_losses():
+                    self.loss = CrossBatchMemory(
+                        self.loss,
+                        embedding_size=self.node.embedding_size,
+                        miner=self.miner,
+                    )
+                else:
+                    logger.warning(
+                        f"'CrossBatchMemory' is not supported for {loss_name}. "
+                        "Ignoring cross_batch_memory_size."
+                    )
+
+        def forward(self, inputs: Tensor, target: Tensor) -> Tensor:
+            if self.miner is not None:
+                hard_pairs = self.miner(inputs, target)
+                return self.loss(inputs, target, hard_pairs)
+            return self.loss(inputs, target)
+
+        @property
+        def name(self) -> str:
+            return self._pml_loss_name
diff --git a/luxonis_train/attached_modules/metrics/README.md b/luxonis_train/attached_modules/metrics/README.md
index 42f42fcb..59021576 100644
--- a/luxonis_train/attached_modules/metrics/README.md
+++ b/luxonis_train/attached_modules/metrics/README.md
@@ -8,6 +8,8 @@ List of all the available metrics.
 - [ObjectKeypointSimilarity](#objectkeypointsimilarity)
 - [MeanAveragePrecision](#meanaverageprecision)
 - [MeanAveragePrecisionKeypoints](#meanaverageprecisionkeypoints)
+- [ClosestIsPositiveAccuracy](#closestispositiveaccuracy)
+- [MedianDistances](#mediandistances)
 
 ## Torchmetrics
 
@@ -63,3 +65,13 @@ Evaluation leverages COCO evaluation framework (COCOeval) to assess mAP performa
 | `area_factor` | `float` | `0.53` | Factor by which to multiply the bounding box area |
 | `max_dets` | `int` | `20` | Maximum number of detections per image |
 | `box_fotmat` | `Literal["xyxy", "xywh", "cxcywh"]` | `"xyxy"` | Format of the bounding boxes |
+
+## ClosestIsPositiveAccuracy
+
+Computes how often the sample closest to a query embedding is a positive one, i.e. shares the query's identity.
+Needs to be connected to the `GhostFaceNetHead` node.
+
+## MedianDistances
+
+Computes the median distances between query embeddings, their closest samples, and their closest positive samples.
+Needs to be connected to the `GhostFaceNetHead` node.
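For reference, the composition performed by `EmbeddingLossWrapper` can be reproduced standalone. A minimal sketch (assuming `pytorch-metric-learning` is installed) of what the `SupConLoss` entry from `configs/embeddings_model.yaml` resolves to; the batch contents are illustrative:

```python
import torch
from pytorch_metric_learning import distances, losses, miners, reducers, regularizers

# The wrapper looks these components up by name from the YAML `params` section.
loss_fn = losses.SupConLoss(
    distance=distances.CosineSimilarity(),
    reducer=reducers.ThresholdReducer(high=0.3),
    embedding_regularizer=regularizers.LpRegularizer(),
)
miner = miners.MultiSimilarityMiner()

embeddings = torch.randn(16, 16)     # (batch_size, embedding_size)
ids = torch.randint(0, 4, (16,))     # "metadata/id" labels
hard_pairs = miner(embeddings, ids)  # mine informative pairs first
loss = loss_fn(embeddings, ids, hard_pairs)
```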
diff --git a/luxonis_train/attached_modules/metrics/__init__.py b/luxonis_train/attached_modules/metrics/__init__.py index cdd0b3ac..df72a785 100644 --- a/luxonis_train/attached_modules/metrics/__init__.py +++ b/luxonis_train/attached_modules/metrics/__init__.py @@ -1,5 +1,6 @@ from .base_metric import BaseMetric from .confusion_matrix import ConfusionMatrix +from .embedding_metrics import ClosestIsPositiveAccuracy, MedianDistances from .mean_average_precision import MeanAveragePrecision from .mean_average_precision_keypoints import MeanAveragePrecisionKeypoints from .object_keypoint_similarity import ObjectKeypointSimilarity @@ -15,5 +16,7 @@ "ObjectKeypointSimilarity", "Precision", "Recall", + "ClosestIsPositiveAccuracy", "ConfusionMatrix", + "MedianDistances", ] diff --git a/luxonis_train/attached_modules/metrics/embedding_metrics.py b/luxonis_train/attached_modules/metrics/embedding_metrics.py new file mode 100644 index 00000000..b09d42f6 --- /dev/null +++ b/luxonis_train/attached_modules/metrics/embedding_metrics.py @@ -0,0 +1,208 @@ +import torch +from torch import Tensor + +from luxonis_train.enums import Metadata +from luxonis_train.nodes.heads.ghostfacenet_head import GhostFaceNetHead + +from .base_metric import BaseMetric + +# Converted from https://omoindrot.github.io/triplet-loss#offline-and-online-triplet-mining +# to PyTorch from TensorFlow + + +class ClosestIsPositiveAccuracy(BaseMetric[Tensor, Tensor]): + supported_tasks = [Metadata("id")] + node: GhostFaceNetHead + + def __init__(self, **kwargs): + super().__init__(**kwargs) + self.cross_batch_memory_size = self.node.cross_batch_memory_size + self.add_state("cross_batch_memory", default=[], dist_reduce_fx="cat") + self.add_state( + "correct_predictions", + default=torch.tensor(0), + dist_reduce_fx="sum", + ) + self.add_state( + "total_predictions", default=torch.tensor(0), dist_reduce_fx="sum" + ) + + def update(self, inputs: Tensor, target: Tensor): + embeddings, labels = inputs, target + + if self.cross_batch_memory_size is not None: + self.cross_batch_memory.extend(list(zip(embeddings, labels))) + + if len(self.cross_batch_memory) > self.cross_batch_memory_size: + self.cross_batch_memory = self.cross_batch_memory[ + -self.cross_batch_memory_size : + ] + + if len(self.cross_batch_memory) < self.cross_batch_memory_size: + return + + embeddings, labels = zip(*self.cross_batch_memory) + embeddings = torch.stack(embeddings) + labels = torch.stack(labels) + + pairwise_distances = _pairwise_distances(embeddings) + pairwise_distances.fill_diagonal_(float("inf")) + + closest_indices = torch.argmin(pairwise_distances, dim=1) + closest_labels = labels[closest_indices] + + positive_mask = _get_anchor_positive_triplet_mask(labels) + num_positives = positive_mask.sum(dim=1) + has_at_least_one_positive_and_negative = (num_positives > 0) & ( + num_positives < len(labels) + ) + + filtered_labels = labels[has_at_least_one_positive_and_negative] + filtered_closest_labels = closest_labels[ + has_at_least_one_positive_and_negative + ] + + correct_predictions = ( + filtered_labels == filtered_closest_labels + ).sum() + + self.correct_predictions += correct_predictions + self.total_predictions += len(filtered_labels) + + def compute(self): + return self.correct_predictions / self.total_predictions + + +class MedianDistances(BaseMetric[Tensor, Tensor]): + supported_tasks = [Metadata("id")] + node: GhostFaceNetHead + + def __init__(self, **kwargs): + super().__init__(**kwargs) + self.cross_batch_memory_size = self.node.cross_batch_memory_size + 
self.add_state("cross_batch_memory", default=[], dist_reduce_fx="cat") + self.add_state("all_distances", default=[], dist_reduce_fx="cat") + self.add_state("closest_distances", default=[], dist_reduce_fx="cat") + self.add_state("positive_distances", default=[], dist_reduce_fx="cat") + self.add_state( + "closest_vs_positive_distances", default=[], dist_reduce_fx="cat" + ) + + def update(self, inputs: Tensor, target: Tensor): + embeddings, labels = inputs, target + + if self.cross_batch_memory_size is not None: + self.cross_batch_memory.extend(list(zip(embeddings, labels))) + + if len(self.cross_batch_memory) > self.cross_batch_memory_size: + self.cross_batch_memory = self.cross_batch_memory[ + -self.cross_batch_memory_size : + ] + + if len(self.cross_batch_memory) < self.cross_batch_memory_size: + return + + embeddings, labels = zip(*self.cross_batch_memory) + embeddings = torch.stack(embeddings) + labels = torch.stack(labels) + + pairwise_distances = _pairwise_distances(embeddings) + self.all_distances.append( + pairwise_distances[ + torch.triu(torch.ones_like(pairwise_distances), diagonal=1) + == 1 + ].flatten() + ) + + pairwise_distances.fill_diagonal_(float("inf")) + + closest_distances, _ = torch.min(pairwise_distances, dim=1) + self.closest_distances.append(closest_distances) + + positive_mask = _get_anchor_positive_triplet_mask(labels).bool() + + only_positive_distances = pairwise_distances.clone() + only_positive_distances[~positive_mask] = float("inf") + + closest_positive_distances, _ = torch.min( + only_positive_distances, dim=1 + ) + + non_inf_mask = closest_positive_distances != float("inf") + difference = closest_positive_distances - closest_distances + difference = difference[non_inf_mask] + + self.closest_vs_positive_distances.append(difference) + self.positive_distances.append( + closest_positive_distances[non_inf_mask] + ) + + def compute(self): + if len(self.all_distances) == 0: + return { + "MedianDistance": torch.tensor(float("nan")), + "MedianClosestDistance": torch.tensor(float("nan")), + "MedianClosestPositiveDistance": torch.tensor(float("nan")), + "MedianClosestVsClosestPositiveDistance": torch.tensor( + float("nan") + ), + } + + all_distances = torch.cat(self.all_distances) + closest_distances = torch.cat(self.closest_distances) + positive_distances = torch.cat(self.positive_distances) + closest_vs_positive_distances = torch.cat( + self.closest_vs_positive_distances + ) + + return { + "MedianDistance": torch.median(all_distances), + "MedianClosestDistance": torch.median(closest_distances), + "MedianClosestPositiveDistance": torch.median(positive_distances), + "MedianClosestVsClosestPositiveDistance": torch.median( + closest_vs_positive_distances + ), + } + + +def _pairwise_distances(embeddings, squared=False): + """Compute the 2D matrix of distances between all the embeddings. + + @param embeddings: tensor of shape (batch_size, embed_dim) + @type embeddings: torch.Tensor + @param squared: If true, output is the pairwise squared euclidean + distance matrix. If false, output is the pairwise euclidean + distance matrix. 
+    @type squared: bool
+    @return: pairwise_distances: tensor of shape (batch_size,
+        batch_size)
+    @rtype: torch.Tensor
+    """
+    dot_product = torch.matmul(embeddings, embeddings.t())
+
+    square_norm = torch.diag(dot_product)
+
+    distances = (
+        square_norm.unsqueeze(0) - 2.0 * dot_product + square_norm.unsqueeze(1)
+    )
+    # Clamp on-device; `torch.max(distances, torch.tensor(0.0))` would mix
+    # devices when the embeddings live on the GPU.
+    distances = torch.clamp(distances, min=0.0)
+
+    if not squared:
+        mask = (distances == 0.0).float()
+        distances = distances + mask * 1e-16
+
+        distances = torch.sqrt(distances)
+
+        distances = distances * (1.0 - mask)
+
+    return distances
+
+
+def _get_anchor_positive_triplet_mask(labels):
+    indices_equal = torch.eye(
+        labels.shape[0], dtype=torch.bool, device=labels.device
+    )
+    indices_not_equal = ~indices_equal
+    labels_equal = labels.unsqueeze(0) == labels.unsqueeze(1)
+    mask = indices_not_equal & labels_equal
+    return mask
diff --git a/luxonis_train/attached_modules/metrics/torchmetrics.py b/luxonis_train/attached_modules/metrics/torchmetrics.py
index c222cb78..553ce31c 100644
--- a/luxonis_train/attached_modules/metrics/torchmetrics.py
+++ b/luxonis_train/attached_modules/metrics/torchmetrics.py
@@ -12,7 +12,7 @@
 logger = logging.getLogger(__name__)
 
 
-class TorchMetricWrapper(BaseMetric[Tensor]):
+class TorchMetricWrapper(BaseMetric[Tensor, Tensor]):
     Metric: type[torchmetrics.Metric]
 
     def __init__(self, **kwargs: Any):
diff --git a/luxonis_train/attached_modules/visualizers/README.md b/luxonis_train/attached_modules/visualizers/README.md
index 03daa87f..afef5066 100644
--- a/luxonis_train/attached_modules/visualizers/README.md
+++ b/luxonis_train/attached_modules/visualizers/README.md
@@ -7,6 +7,8 @@ Visualizers are used to render the output of a node. They are used in the `visua
 - [`BBoxVisualizer`](#bboxvisualizer)
 - [`ClassificationVisualizer`](#classificationvisualizer)
 - [`KeypointVisualizer`](#keypointvisualizer)
+- [`SegmentationVisualizer`](#segmentationvisualizer)
+- [`EmbeddingsVisualizer`](#embeddingsvisualizer)
 - [`MultiVisualizer`](#multivisualizer)
 
 ## `BBoxVisualizer`
@@ -72,6 +74,14 @@ Visualizer for bounding boxes.
 
 ![class_viz_example](https://github.com/luxonis/luxonis-train/blob/main/media/example_viz/class.png)
 
+## `EmbeddingsVisualizer`
+
+**Parameters:**
+
+| Key | Type | Default value | Description |
+| ------------------- | ------- | ------------- | ----------------------------------------------------------------------------------------------------------------------------- |
+| `z_score_threshold` | `float` | `3.0` | Threshold for z-score filtering. Embeddings with a z-score higher than this value are considered outliers and are not drawn. |
+
 ## `MultiVisualizer`
 
 Special type of meta-visualizer that combines several visualizers into one. The combined visualizers share canvas.
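`_pairwise_distances` relies on the expansion ||a - b||^2 = ||a||^2 - 2 a.b + ||b||^2 to obtain all pairwise distances from a single matrix product. A quick numeric sanity check of that identity (a sketch; `torch.cdist` is used only as the reference):

```python
import torch

emb = torch.randn(8, 16)  # (batch_size, embed_dim)
dot = emb @ emb.t()
sq = dot.diag()           # squared norm of each embedding

# Same formula as `_pairwise_distances`, minus the epsilon masking it
# applies to keep gradients finite at zero distance.
fast = (sq.unsqueeze(0) - 2.0 * dot + sq.unsqueeze(1)).clamp_min(0.0).sqrt()
slow = torch.cdist(emb, emb)

assert torch.allclose(fast, slow, atol=1e-4)
```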
diff --git a/luxonis_train/attached_modules/visualizers/__init__.py b/luxonis_train/attached_modules/visualizers/__init__.py
index 1bd65f50..7389aa57 100644
--- a/luxonis_train/attached_modules/visualizers/__init__.py
+++ b/luxonis_train/attached_modules/visualizers/__init__.py
@@ -1,6 +1,7 @@
 from .base_visualizer import BaseVisualizer
 from .bbox_visualizer import BBoxVisualizer
 from .classification_visualizer import ClassificationVisualizer
+from .embeddings_visualizer import EmbeddingsVisualizer
 from .instance_segmentation_visualizer import InstanceSegmentationVisualizer
 from .keypoint_visualizer import KeypointVisualizer
 from .multi_visualizer import MultiVisualizer
@@ -24,6 +25,7 @@
     "KeypointVisualizer",
     "MultiVisualizer",
     "SegmentationVisualizer",
+    "EmbeddingsVisualizer",
     "InstanceSegmentationVisualizer",
     "combine_visualizations",
     "draw_bounding_box_labels",
diff --git a/luxonis_train/attached_modules/visualizers/embeddings_visualizer.py b/luxonis_train/attached_modules/visualizers/embeddings_visualizer.py
new file mode 100644
index 00000000..da483705
--- /dev/null
+++ b/luxonis_train/attached_modules/visualizers/embeddings_visualizer.py
@@ -0,0 +1,133 @@
+import logging
+from collections.abc import Callable
+
+import numpy as np
+import seaborn as sns
+from luxonis_ml.data.utils import ColorMap
+from matplotlib import pyplot as plt
+from sklearn.decomposition import PCA
+from torch import Tensor
+
+from luxonis_train.enums import Metadata
+
+from .base_visualizer import BaseVisualizer
+from .utils import figure_to_torch
+
+logger = logging.getLogger(__name__)
+
+
+class EmbeddingsVisualizer(BaseVisualizer[Tensor, Tensor]):
+    supported_tasks = [Metadata("id")]
+
+    def __init__(self, z_score_threshold: float = 3, **kwargs):
+        """Visualizer for embedding tasks like reID.
+
+        @type z_score_threshold: float
+        @param z_score_threshold: Embeddings whose 2D projection has a
+            z-score above this threshold are treated as outliers and
+            are not drawn.
+        """
+        super().__init__(**kwargs)
+        self.colors = ColorMap()
+        self.z_score_threshold = z_score_threshold
+
+    def forward(
+        self,
+        label_canvas: Tensor,
+        prediction_canvas: Tensor,
+        embeddings: Tensor,
+        ids: Tensor,
+    ) -> tuple[Tensor, Tensor]:
+        """Creates a visualization of the embeddings.
+
+        @type label_canvas: Tensor
+        @param label_canvas: The canvas to draw the labels on.
+        @type prediction_canvas: Tensor
+        @param prediction_canvas: The canvas to draw the predictions on.
+        @type embeddings: Tensor
+        @param embeddings: The embeddings to visualize.
+        @type ids: Tensor
+        @param ids: The ids to visualize.
+        @rtype: tuple[Tensor, Tensor]
+        @return: A KDE plot and a scatter plot of the embeddings
+            projected to 2D.
+ """ + + embeddings_np = embeddings.detach().cpu().numpy() + ids_np = ids.detach().cpu().numpy().astype(int) + + pca = PCA(n_components=2) + embeddings_2d = pca.fit_transform(embeddings_np) + embeddings_2d, ids_np = self._filter_outliers(embeddings_2d, ids_np) + + kdeplot = self.plot_to_tensor(embeddings_2d, ids_np, self.kde_plot) + scatterplot = self.plot_to_tensor( + embeddings_2d, ids_np, self.scatter_plot + ) + + return kdeplot, scatterplot + + def _get_color(self, label: int) -> tuple[float, float, float]: + r, g, b = self.colors[label] + return r / 255, g / 255, b / 255 + + def _filter_outliers( + self, points: np.ndarray, ids: np.ndarray + ) -> tuple[np.ndarray, np.ndarray]: + mean = np.mean(points, axis=0) + std_dev = np.std(points, axis=0) + z_scores = (points - mean) / std_dev + + mask = (np.abs(z_scores) < self.z_score_threshold).all(axis=1) + logger.info(f"Filtered out {len(points) - mask.sum()} outliers") + return points[mask], ids[mask] + + @staticmethod + def plot_to_tensor( + embeddings_2d: np.ndarray, + ids_np: np.ndarray, + plot_func: Callable[[plt.Axes, np.ndarray, np.ndarray], None], + ) -> Tensor: + fig, ax = plt.subplots(figsize=(10, 10)) + ax.set_xlim(embeddings_2d[:, 0].min(), embeddings_2d[:, 0].max()) + ax.set_ylim(embeddings_2d[:, 1].min(), embeddings_2d[:, 1].max()) + + plot_func(ax, embeddings_2d, ids_np) + ax.axis("off") + + tensor_image = figure_to_torch(fig, width=512, height=512).unsqueeze(0) + plt.close(fig) + return tensor_image + + def kde_plot( + self, ax: plt.Axes, emb: np.ndarray, labels: np.ndarray + ) -> None: + for label in np.unique(labels): + subset = emb[labels == label] + color = self._get_color(label) + sns.kdeplot( + x=subset[:, 0], + y=subset[:, 1], + color=color, + alpha=0.9, + bw_adjust=1.5, + fill=True, + warn_singular=False, + ax=ax, + ) + + def scatter_plot( + self, ax: plt.Axes, emb: np.ndarray, labels: np.ndarray + ) -> None: + unique_labels = np.unique(labels) + palette = {lbl: self._get_color(lbl) for lbl in unique_labels} + sns.scatterplot( + x=emb[:, 0], + y=emb[:, 1], + hue=labels, + palette=palette, + alpha=0.9, + s=300, + legend=False, + ax=ax, + ) diff --git a/luxonis_train/attached_modules/visualizers/utils.py b/luxonis_train/attached_modules/visualizers/utils.py index 1a571eca..ac95046b 100644 --- a/luxonis_train/attached_modules/visualizers/utils.py +++ b/luxonis_train/attached_modules/visualizers/utils.py @@ -427,6 +427,6 @@ def resize_to_match( case _: raise ValueError( "Visualization should be either a single tensor or a tuple of " - "two tensors or a tuple of a tensor and a list of tensors." + "two tensors or a tuple of a tensor and a list of tensors. " f"Got: `{type(visualization)}`." ) diff --git a/luxonis_train/callbacks/ema.py b/luxonis_train/callbacks/ema.py index 63166ad3..a5ecd995 100644 --- a/luxonis_train/callbacks/ema.py +++ b/luxonis_train/callbacks/ema.py @@ -36,7 +36,7 @@ def __init__( @type decay_tau: float @param decay_tau: Decay tau for the moving average. 
""" - super(ModelEma, self).__init__() + super().__init__() model.eval() self.state_dict_ema = deepcopy(model.state_dict()) model.train() diff --git a/luxonis_train/config/config.py b/luxonis_train/config/config.py index 159a39fa..fa770505 100644 --- a/luxonis_train/config/config.py +++ b/luxonis_train/config/config.py @@ -67,6 +67,7 @@ class ModelNodeConfig(BaseModelExtraForbid): freezing: FreezingConfig = FreezingConfig() remove_on_export: bool = False task_name: str = "" + metadata_task_override: str | dict[str, str] | None = None params: Params = {} @@ -98,13 +99,14 @@ def validate_nodes(cls, nodes: Any) -> Any: names = [] last_body_index: int | None = None for i, node in enumerate(nodes): - name = node.get("alias", node.get("name")) + name = node.get("name") if name is None: raise ValueError( f"Node {i} does not specify the `name` field." ) if "Head" in name and last_body_index is None: last_body_index = i - 1 + name = node.get("alias") or name names.append(name) if i > 0 and "inputs" not in node and "input_sources" not in node: if last_body_index is not None: @@ -243,7 +245,7 @@ def check_attached_modules(cls, data: Params) -> Params: else: warnings.warn( f"Field `model.{section}` is deprecated. " - f"Please specify `{section}`under " + f"Please specify `{section}` under " "the node they are attached to." ) for node in data["nodes"]: diff --git a/luxonis_train/enums.py b/luxonis_train/enums.py index 09d38fb2..d82e8378 100644 --- a/luxonis_train/enums.py +++ b/luxonis_train/enums.py @@ -1,12 +1,27 @@ +from dataclasses import dataclass from enum import Enum +from typing import TypeAlias class TaskType(str, Enum): - """Tasks supported by nodes in LuxonisTrain.""" - CLASSIFICATION = "classification" SEGMENTATION = "segmentation" INSTANCE_SEGMENTATION = "instance_segmentation" BOUNDINGBOX = "boundingbox" KEYPOINTS = "keypoints" ARRAY = "array" + + +@dataclass(unsafe_hash=True) +class Metadata: + name: str + + @property + def value(self): + return f"metadata/{self.name}" + + def __str__(self) -> str: + return self.value + + +Task: TypeAlias = TaskType | Metadata diff --git a/luxonis_train/loaders/base_loader.py b/luxonis_train/loaders/base_loader.py index db97ac00..b0c7a79e 100644 --- a/luxonis_train/loaders/base_loader.py +++ b/luxonis_train/loaders/base_loader.py @@ -249,6 +249,11 @@ def get_n_keypoints(self) -> dict[str, int] | None: """ return None + def get_metadata_types( + self, + ) -> dict[str, dict[str, type[int] | type[float] | type[str]]]: + return {} + def dict_numpy_to_torch( self, numpy_dictionary: dict[str, np.ndarray] ) -> dict[str, Tensor]: @@ -260,10 +265,14 @@ def dict_numpy_to_torch( @rtype: dict[str, torch.Tensor] @return: Dictionary of torch tensors. """ - return { - task: torch.tensor(array).float() - for task, array in numpy_dictionary.items() - } + torch_dictionary = {} + + for task, array in numpy_dictionary.items(): + if array.dtype.kind in "U": + array = np.array([ord(c) for c in array[0]], dtype=np.int32) + torch_dictionary[task] = torch.tensor(array, dtype=torch.float32) + + return torch_dictionary def read_image(self, path: str) -> npt.NDArray[np.float32]: """Reads an image from a file. 
diff --git a/luxonis_train/loaders/luxonis_loader_torch.py b/luxonis_train/loaders/luxonis_loader_torch.py
index 4267cced..531b4b1f 100644
--- a/luxonis_train/loaders/luxonis_loader_torch.py
+++ b/luxonis_train/loaders/luxonis_loader_torch.py
@@ -127,6 +127,15 @@ def get_n_keypoints(self) -> dict[str, int]:
         skeletons = self.dataset.get_skeletons()
         return {task: len(skeletons[task][0]) for task in skeletons}
 
+    @override
+    def get_metadata_types(
+        self,
+    ) -> dict[str, type[int] | type[float] | type[str]]:
+        return {
+            k: {"float": float, "int": int, "str": str, "Category": int}[v]
+            for k, v in self.dataset.get_metadata_types().items()
+        }
+
     def augment_test_image(self, img: Tensor) -> Tensor:
         if self.loader.augmentations is None:
             return img
diff --git a/luxonis_train/loaders/utils.py b/luxonis_train/loaders/utils.py
index aed4df94..aa6b9fb4 100644
--- a/luxonis_train/loaders/utils.py
+++ b/luxonis_train/loaders/utils.py
@@ -1,5 +1,5 @@
 import torch
-from luxonis_ml.data.utils import get_task_type
+from luxonis_ml.data.utils import get_task_type, task_is_metadata
 from torch import Tensor
 
 from luxonis_train.utils.types import Labels
@@ -39,15 +39,13 @@ def collate_fn(
             label_box: list[Tensor] = []
             for i, ann in enumerate(annos):
                 new_ann = torch.zeros((ann.shape[0], ann.shape[1] + 1))
-                # add target image index for build_targets()
+                # add batch index to separate boxes from different images
                 new_ann[:, 0] = i
                 new_ann[:, 1:] = ann
                 label_box.append(new_ann)
             out_labels[task] = torch.cat(label_box, 0)
-
-        elif task_type == "instance_segmentation":
-            masks = [label[task] for label in labels]
-            out_labels[task] = torch.cat(masks, 0)
+        elif task_type == "instance_segmentation" or task_is_metadata(task):
+            out_labels[task] = torch.cat(annos, 0)
         else:
             out_labels[task] = torch.stack(annos, 0)
 
diff --git a/luxonis_train/models/luxonis_lightning.py b/luxonis_train/models/luxonis_lightning.py
index 17aea732..2b7252c8 100644
--- a/luxonis_train/models/luxonis_lightning.py
+++ b/luxonis_train/models/luxonis_lightning.py
@@ -161,7 +161,7 @@ def __init__(
             dict
         )
 
-        self._logged_images = 0
+        self._logged_images = defaultdict(int)
 
         frozen_nodes: list[tuple[str, int]] = []
         nodes: dict[str, tuple[type[BaseNode], Kwargs]] = {}
@@ -191,13 +191,13 @@
                 f"Node {node_name} does not have the `task_name` parameter set. "
                 "Please specify the `task_name` parameter for each head node. "
             )
-
         nodes[node_name] = (
             Node,
             {
                 **node_cfg.params,
                 "task_name": node_cfg.task_name,
                 "remove_on_export": node_cfg.remove_on_export,
+                "metadata_task_override": node_cfg.metadata_task_override,
             },
         )
 
@@ -312,15 +312,10 @@ def _initiate_nodes(
             for source_name, shape in shapes.items()
         }
 
-        for (
-            node_name,
-            (
-                Node,
-                node_kwargs,
-            ),
-            node_input_names,
-            _,
-        ) in traverse_graph(self.graph, nodes):
+        for node_name, (
+            Node,
+            node_kwargs,
+        ), node_input_names, _ in traverse_graph(self.graph, nodes):
             node_dummy_inputs: list[Packet[Tensor]] = []
             """List of dummy input packets for the node.
@@ -774,8 +769,13 @@ def _evaluation_step( ) -> dict[str, Tensor]: inputs, labels = batch images = None - if self._logged_images < self.cfg.trainer.n_log_images: + if not self._logged_images: images = get_denormalized_images(self.cfg, inputs) + for value in self._logged_images.values(): + if value < self.cfg.trainer.n_log_images: + images = get_denormalized_images(self.cfg, inputs) + break + outputs = self.forward( inputs, labels, @@ -790,17 +790,16 @@ def _evaluation_step( logged_images = self._logged_images for node_name, visualizations in outputs.visualizations.items(): for viz_name, viz_batch in visualizations.items(): - logged_images = self._logged_images for viz in viz_batch: - if logged_images >= self.cfg.trainer.n_log_images: - break + name = f"{mode}/visualizations/{node_name}/{viz_name}" + if logged_images[name] >= self.cfg.trainer.n_log_images: + continue self.logger.log_image( - f"{mode}/visualizations/{node_name}/{viz_name}/{logged_images}", + f"{name}/{logged_images[name]}", viz.detach().cpu().numpy().transpose(1, 2, 0), step=self.current_epoch, ) - logged_images += 1 - self._logged_images = logged_images + logged_images[name] += 1 return step_output @@ -840,7 +839,7 @@ def _evaluation_epoch_end(self, mode: Literal["test", "val"]) -> None: ) self.validation_step_outputs.clear() - self._logged_images = 0 + self._logged_images.clear() def configure_callbacks(self) -> list[pl.Callback]: """Configures Pytorch Lightning callbacks.""" diff --git a/luxonis_train/nodes/README.md b/luxonis_train/nodes/README.md index ab139d04..3d881ddf 100644 --- a/luxonis_train/nodes/README.md +++ b/luxonis_train/nodes/README.md @@ -18,6 +18,7 @@ arbitrarily as long as the two nodes are compatible with each other. We've group - [`DDRNet`](#ddrnet) - [`RecSubNet`](#recsubnet) - [`EfficientViT`](#efficientvit) + - [`GhostFaceNetV2`](#ghostfacenetv2) - [Necks](#necks) - [`RepPANNeck`](#reppanneck) - [Heads](#heads) @@ -29,9 +30,11 @@ arbitrarily as long as the two nodes are compatible with each other. We've group - [`DDRNetSegmentationHead`](#ddrnetsegmentationhead) - [`DiscSubNetHead`](#discsubnet) - [`FOMOHead`](#fomohead) + - [`GhostFaceNetHead`](#ghostfacenethead) - [`PrecisionBBoxHead`](#precisionbboxhead) - [`PrecisionSegmentBBoxHead`](#precisionsegmentbboxhead) - Every node takes these parameters: + +Every node takes these parameters: | Key | Type | Default value | Description | | ------------------ | ------------- | ------------- | --------------------------------------------------------------------------- | @@ -188,6 +191,14 @@ Adapted from [here](https://arxiv.org/abs/2205.14756) | `expand_ratio` | `int` | `4` | Factor by which channels expand in the local module | | `dim` | `int` | `None` | Dimension size for each attention head | +### `GhostFaceNetV2` + +**Parameters:** + +| Key | Type | Default value | Description | +| --------- | --------------- | ------------- | --------------------------- | +| `variant` | `Literal["V2"]` | `"V2"` | The variant of the network. | + ## Neck ### `RepPANNeck` @@ -293,7 +304,15 @@ Adapted from [here](https://arxiv.org/abs/2108.07610). | `conv_channels` | `int` | `16` | Number of output channels for each convolutional layer. | | `use_nms` | `bool` | `False` | If True, enable NMS. This can reduce FP, but it will also reduce TP for close neighbors. 
| -## `PrecisionBBoxHead` +### `GhostFaceNetHead` + +**Parameters:** + +| Key | Type | Default value | Description | +| ---------------- | ----- | ------------- | ---------------------------------------- | +| `embedding_size` | `int` | `512` | The size of the output embedding vector. | + +### `PrecisionBBoxHead` Adapted from [here](https://arxiv.org/pdf/2207.02696.pdf) and [here](https://arxiv.org/pdf/2209.02976.pdf). @@ -307,7 +326,7 @@ Adapted from [here](https://arxiv.org/pdf/2207.02696.pdf) and [here](https://arx | `iou_thres` | `float` | `0.45` | IoU threshold for non-maxima-suppression (used for evaluation) | | `max_det` | `int` | `300` | Max number of detections for non-maxima-suppression (used for evaluation) | -## `PrecisionSegmentBBoxHead` +### `PrecisionSegmentBBoxHead` Adapted from [here](https://arxiv.org/pdf/2207.02696.pdf) and [here](https://arxiv.org/pdf/2209.02976.pdf). diff --git a/luxonis_train/nodes/backbones/__init__.py b/luxonis_train/nodes/backbones/__init__.py index 441086b7..520296e8 100644 --- a/luxonis_train/nodes/backbones/__init__.py +++ b/luxonis_train/nodes/backbones/__init__.py @@ -3,6 +3,7 @@ from .efficientnet import EfficientNet from .efficientrep import EfficientRep from .efficientvit import EfficientViT +from .ghostfacenet import GhostFaceNetV2 from .micronet import MicroNet from .mobilenetv2 import MobileNetV2 from .mobileone import MobileOne @@ -23,5 +24,6 @@ "ResNet", "DDRNet", "RecSubNet", + "GhostFaceNetV2", "EfficientViT", ] diff --git a/luxonis_train/nodes/backbones/ddrnet/ddrnet.py b/luxonis_train/nodes/backbones/ddrnet/ddrnet.py index 2698c26d..6bee5dfc 100644 --- a/luxonis_train/nodes/backbones/ddrnet/ddrnet.py +++ b/luxonis_train/nodes/backbones/ddrnet/ddrnet.py @@ -148,7 +148,7 @@ def __init__( out_channels=highres_channels, kernel_size=1, bias=False, - activation=nn.Identity(), + activation=False, ) ) self.down3.append( @@ -159,7 +159,7 @@ def __init__( stride=2, padding=1, bias=False, - activation=nn.Identity(), + activation=False, ) ) self.layer3_skip.append( @@ -180,7 +180,7 @@ def __init__( out_channels=highres_channels, kernel_size=1, bias=False, - activation=nn.Identity(), + activation=False, ) self.down4 = nn.Sequential( @@ -200,7 +200,7 @@ def __init__( stride=2, padding=1, bias=False, - activation=nn.Identity(), + activation=False, ), ) diff --git a/luxonis_train/nodes/backbones/ghostfacenet/__init__.py b/luxonis_train/nodes/backbones/ghostfacenet/__init__.py new file mode 100644 index 00000000..c24d9afb --- /dev/null +++ b/luxonis_train/nodes/backbones/ghostfacenet/__init__.py @@ -0,0 +1,3 @@ +from .ghostfacenet import GhostFaceNetV2 + +__all__ = ["GhostFaceNetV2"] diff --git a/luxonis_train/nodes/backbones/ghostfacenet/blocks.py b/luxonis_train/nodes/backbones/ghostfacenet/blocks.py new file mode 100644 index 00000000..118d61ac --- /dev/null +++ b/luxonis_train/nodes/backbones/ghostfacenet/blocks.py @@ -0,0 +1,188 @@ +import math +from typing import Literal + +import torch +import torch.nn.functional as F +from torch import Tensor, nn + +from luxonis_train.nodes.backbones.micronet.blocks import _make_divisible +from luxonis_train.nodes.blocks import SqueezeExciteBlock +from luxonis_train.nodes.blocks.blocks import ConvModule + + +class GhostModuleV2(nn.Module): + def __init__( + self, + in_channels: int, + out_channels: int, + mode: Literal["original", "attn"], + kernel_size: int = 1, + ratio: int = 2, + dw_size: int = 3, + stride: int = 1, + use_prelu: bool = True, + ): + super().__init__() + self.mode = mode + self.out_channels 
= out_channels + intermediate_channels = math.ceil(out_channels / ratio) + new_channels = intermediate_channels * (ratio - 1) + self.primary_conv = ConvModule( + in_channels, + intermediate_channels, + kernel_size, + stride, + kernel_size // 2, + activation=nn.PReLU() if use_prelu else False, + ) + self.cheap_operation = ConvModule( + intermediate_channels, + new_channels, + dw_size, + 1, + dw_size // 2, + groups=intermediate_channels, + activation=nn.PReLU() if use_prelu else False, + ) + + if self.mode == "attn": + self.short_conv = nn.Sequential( + ConvModule( + in_channels, + out_channels, + kernel_size, + stride, + kernel_size // 2, + activation=False, + ), + ConvModule( + out_channels, + out_channels, + kernel_size=(1, 5), + stride=1, + padding=(0, 2), + groups=out_channels, + activation=False, + ), + ConvModule( + out_channels, + out_channels, + kernel_size=(5, 1), + stride=1, + padding=(2, 0), + groups=out_channels, + activation=False, + ), + nn.AvgPool2d(kernel_size=2, stride=2), + nn.Sigmoid(), + ) + + def forward(self, x: Tensor) -> Tensor: + x1 = self.primary_conv(x) + x2 = self.cheap_operation(x1) + out = torch.cat([x1, x2], dim=1) + if self.mode == "original": + return out[:, : self.out_channels, ...] + + return out[:, : self.out_channels, ...] * F.interpolate( + self.short_conv(x), + size=(out.shape[-2], out.shape[-1]), + mode="nearest", + ) + + +class GhostBottleneckV2(nn.Module): + def __init__( + self, + in_channels: int, + intermediate_channels: int, + out_channels: int, + dw_kernel_size: int = 3, + stride: int = 1, + se_ratio: float = 0.0, + *, + layer_id: int, + ): + super().__init__() + has_se = se_ratio is not None and se_ratio > 0.0 + self.stride = stride + + # Point-wise expansion + if layer_id <= 1: + self.ghost1 = GhostModuleV2( + in_channels, + intermediate_channels, + use_prelu=True, + mode="original", + ) + else: + self.ghost1 = GhostModuleV2( + in_channels, intermediate_channels, use_prelu=True, mode="attn" + ) + + # Depth-wise convolution + if self.stride > 1: + self.conv_dw = nn.Conv2d( + intermediate_channels, + intermediate_channels, + dw_kernel_size, + stride=stride, + padding=(dw_kernel_size - 1) // 2, + groups=intermediate_channels, + bias=False, + ) + self.bn_dw = nn.BatchNorm2d(intermediate_channels) + + # Squeeze-and-excitation + if has_se: + reduced_chs = _make_divisible(intermediate_channels * se_ratio, 4) + self.se = SqueezeExciteBlock( + intermediate_channels, reduced_chs, True, activation=nn.PReLU() + ) + else: + self.se = None + + self.ghost2 = GhostModuleV2( + intermediate_channels, + out_channels, + use_prelu=False, + mode="original", + ) + + # shortcut + if in_channels == out_channels and self.stride == 1: + self.shortcut = nn.Identity() + else: + self.shortcut = nn.Sequential( + nn.Conv2d( + in_channels, + in_channels, + dw_kernel_size, + stride=stride, + padding=(dw_kernel_size - 1) // 2, + groups=in_channels, + bias=False, + ), + nn.BatchNorm2d(in_channels), + nn.Conv2d( + in_channels, + out_channels, + 1, + stride=1, + padding=0, + bias=False, + ), + nn.BatchNorm2d(out_channels), + ) + + def forward(self, x): + residual = x + x = self.ghost1(x) + if self.stride > 1: + x = self.conv_dw(x) + x = self.bn_dw(x) + if self.se is not None: + x = self.se(x) + x = self.ghost2(x) + x += self.shortcut(residual) + return x diff --git a/luxonis_train/nodes/backbones/ghostfacenet/ghostfacenet.py b/luxonis_train/nodes/backbones/ghostfacenet/ghostfacenet.py new file mode 100644 index 00000000..2ad0227c --- /dev/null +++ 
b/luxonis_train/nodes/backbones/ghostfacenet/ghostfacenet.py
@@ -0,0 +1,103 @@
+# Original source: https://github.com/Hazqeel09/ellzaf_ml/blob/main/ellzaf_ml/models/ghostfacenetsv2.py
+import math
+from typing import Literal
+
+import torch.nn as nn
+from torch import Tensor
+
+from luxonis_train.nodes.backbones.ghostfacenet.variants import get_variant
+from luxonis_train.nodes.backbones.micronet.blocks import _make_divisible
+from luxonis_train.nodes.base_node import BaseNode
+from luxonis_train.nodes.blocks import ConvModule
+
+
+class GhostFaceNetV2(BaseNode[Tensor, Tensor]):
+    in_channels: int
+    in_width: int
+
+    def __init__(self, variant: Literal["V2"] = "V2", **kwargs):
+        """GhostFaceNetsV2 backbone.
+
+        GhostFaceNetsV2 is a convolutional neural network architecture
+        focused on face recognition, but it is adaptable to generic
+        embedding tasks. It is based on the GhostNet architecture and
+        uses Ghost BottleneckV2 blocks.
+
+        Source: U{https://github.com/Hazqeel09/ellzaf_ml/blob/main/ellzaf_ml/models/ghostfacenetsv2.py}
+
+        @license: U{MIT License
+            }
+
+        @see: U{GhostFaceNets: Lightweight Face Recognition Model From Cheap Operations
+            }
+
+        @type variant: Literal["V2"]
+        @param variant: Variant of the GhostFaceNets embedding model.
+            Defaults to "V2" (which is the only variant available).
+        """
+        super().__init__(**kwargs)
+
+        var = get_variant(variant)
+        output_channel = _make_divisible(int(16 * var.width), 4)
+        input_channel = output_channel
+
+        stages: list[nn.Module] = [
+            ConvModule(
+                self.in_channels,
+                output_channel,
+                kernel_size=3,
+                stride=2,
+                padding=1,
+                activation=nn.PReLU(),
+            )
+        ]
+        layer_id = 0
+        for cfg in var.block_configs:
+            layers = []
+            for b_cfg in cfg:
+                output_channel = _make_divisible(
+                    b_cfg.output_channels * var.width, 4
+                )
+                hidden_channel = _make_divisible(
+                    b_cfg.expand_size * var.width, 4
+                )
+                layers.append(
+                    var.block(
+                        input_channel,
+                        hidden_channel,
+                        output_channel,
+                        b_cfg.kernel_size,
+                        b_cfg.stride,
+                        se_ratio=b_cfg.se_ratio,
+                        layer_id=layer_id,
+                    )
+                )
+                input_channel = output_channel
+                layer_id += 1
+            stages.append(nn.Sequential(*layers))
+
+        output_channel = _make_divisible(b_cfg.expand_size * var.width, 4)
+        stages.append(
+            ConvModule(
+                input_channel,
+                output_channel,
+                kernel_size=1,
+                activation=nn.PReLU(),
+            )
+        )
+
+        self.blocks = nn.Sequential(*stages)
+
+        self._init_weights()
+
+    def _init_weights(self) -> None:
+        for m in self.modules():
+            if isinstance(m, (nn.Conv2d, nn.Linear)):
+                fan_in, _ = nn.init._calculate_fan_in_and_fan_out(m.weight)
+                negative_slope = 0.25
+                m.weight.data.normal_(
+                    0, math.sqrt(2.0 / (fan_in * (1 + negative_slope**2)))
+                )
+            if isinstance(m, nn.BatchNorm2d):
+                m.momentum = 0.9
+                m.eps = 1e-5
+
+    def forward(self, x: Tensor) -> Tensor:
+        return self.blocks(x)
diff --git a/luxonis_train/nodes/backbones/ghostfacenet/variants.py b/luxonis_train/nodes/backbones/ghostfacenet/variants.py
new file mode 100644
index 00000000..9e09befc
--- /dev/null
+++ b/luxonis_train/nodes/backbones/ghostfacenet/variants.py
@@ -0,0 +1,181 @@
+from typing import Literal
+
+from pydantic import BaseModel
+from torch import nn
+
+from .blocks import GhostBottleneckV2
+
+
+class BlockConfig(BaseModel):
+    kernel_size: int
+    expand_size: int
+    output_channels: int
+    stride: int
+    se_ratio: float
+
+
+class GhostFaceNetsVariant(BaseModel):
+    """Variant of the GhostFaceNets embedding model.
+
+    @type width: int
+    @param width: Width multiplier. Increases complexity and number of
+        parameters. Defaults to 1.
+    @type block: type[nn.Module]
+    @param block: Ghost BottleneckV2 block to use. Defaults to
+        GhostBottleneckV2.
+    @type block_configs: list[list[BlockConfig]]
+    @param block_configs: List of Ghost BottleneckV2 configurations,
+        one inner list per stage.
+    """
+
+    width: int
+    block: type[nn.Module]
+    block_configs: list[list[BlockConfig]]
+
+
+V2 = GhostFaceNetsVariant(
+    width=1,
+    block=GhostBottleneckV2,
+    block_configs=[
+        [
+            BlockConfig(
+                kernel_size=3,
+                expand_size=16,
+                output_channels=16,
+                se_ratio=0.0,
+                stride=1,
+            )
+        ],
+        [
+            BlockConfig(
+                kernel_size=3,
+                expand_size=48,
+                output_channels=24,
+                se_ratio=0.0,
+                stride=2,
+            )
+        ],
+        [
+            BlockConfig(
+                kernel_size=3,
+                expand_size=72,
+                output_channels=24,
+                se_ratio=0.0,
+                stride=1,
+            )
+        ],
+        [
+            BlockConfig(
+                kernel_size=5,
+                expand_size=72,
+                output_channels=40,
+                se_ratio=0.25,
+                stride=2,
+            )
+        ],
+        [
+            BlockConfig(
+                kernel_size=5,
+                expand_size=120,
+                output_channels=40,
+                se_ratio=0.25,
+                stride=1,
+            )
+        ],
+        [
+            BlockConfig(
+                kernel_size=3,
+                expand_size=240,
+                output_channels=80,
+                se_ratio=0.0,
+                stride=2,
+            )
+        ],
+        [
+            BlockConfig(
+                kernel_size=3,
+                expand_size=200,
+                output_channels=80,
+                se_ratio=0.0,
+                stride=1,
+            ),
+            BlockConfig(
+                kernel_size=3,
+                expand_size=184,
+                output_channels=80,
+                se_ratio=0.0,
+                stride=1,
+            ),
+            BlockConfig(
+                kernel_size=3,
+                expand_size=184,
+                output_channels=80,
+                se_ratio=0.0,
+                stride=1,
+            ),
+            BlockConfig(
+                kernel_size=3,
+                expand_size=480,
+                output_channels=112,
+                se_ratio=0.25,
+                stride=1,
+            ),
+            BlockConfig(
+                kernel_size=3,
+                expand_size=672,
+                output_channels=112,
+                se_ratio=0.25,
+                stride=1,
+            ),
+        ],
+        [
+            BlockConfig(
+                kernel_size=5,
+                expand_size=672,
+                output_channels=160,
+                se_ratio=0.25,
+                stride=2,
+            )
+        ],
+        [
+            BlockConfig(
+                kernel_size=5,
+                expand_size=960,
+                output_channels=160,
+                se_ratio=0.0,
+                stride=1,
+            ),
+            BlockConfig(
+                kernel_size=5,
+                expand_size=960,
+                output_channels=160,
+                se_ratio=0.25,
+                stride=1,
+            ),
+            BlockConfig(
+                kernel_size=5,
+                expand_size=960,
+                output_channels=160,
+                se_ratio=0.0,
+                stride=1,
+            ),
+            BlockConfig(
+                kernel_size=5,
+                expand_size=960,
+                output_channels=160,
+                se_ratio=0.25,
+                stride=1,
+            ),
+        ],
+    ],
+)
+
+
+def get_variant(variant: Literal["V2"]) -> GhostFaceNetsVariant:
+    variants = {"V2": V2}
+    if variant not in variants:  # pragma: no cover
+        raise ValueError(
+            "GhostFaceNets model variant should be in "
+            f"{list(variants.keys())}, got {variant}."
+ ) + return variants[variant].model_copy() diff --git a/luxonis_train/nodes/backbones/micronet/blocks.py b/luxonis_train/nodes/backbones/micronet/blocks.py index 3da5e15e..a1fd8f13 100644 --- a/luxonis_train/nodes/backbones/micronet/blocks.py +++ b/luxonis_train/nodes/backbones/micronet/blocks.py @@ -145,7 +145,7 @@ def _create_lite_block( out_channels=out_channels, kernel_size=1, groups=group2, - activation=nn.Identity(), + activation=False, ), DYShiftMax( out_channels, @@ -179,7 +179,7 @@ def _create_transition_block( out_channels=intermediate_channels, kernel_size=1, groups=group1, - activation=nn.Identity(), + activation=False, ), DYShiftMax( intermediate_channels, @@ -217,7 +217,7 @@ def _create_full_block( out_channels=intermediate_channels, kernel_size=1, groups=groups_1[0], - activation=nn.Identity(), + activation=False, ), DYShiftMax( intermediate_channels, @@ -256,7 +256,7 @@ def _create_full_block( out_channels=out_channels, kernel_size=1, groups=group1, - activation=nn.Identity(), + activation=False, ), DYShiftMax( out_channels, @@ -357,7 +357,7 @@ def __init__( self.avg_pool = nn.AdaptiveAvgPool2d(1) - squeeze_channels = self._make_divisible(in_channels // reduction, 4) + squeeze_channels = _make_divisible(in_channels // reduction, 4) self.fc = nn.Sequential( nn.Linear(in_channels, squeeze_channels), @@ -413,16 +413,14 @@ def forward(self, x: Tensor) -> Tensor: return out - def _make_divisible( - self, value: int, divisor: int, min_value: int | None = None - ) -> int: - if min_value is None: - min_value = divisor - new_v = max(min_value, int(value + divisor / 2) // divisor * divisor) - # Make sure that round down does not go down by more than 10%. - if new_v < 0.9 * value: - new_v += divisor - return new_v + +def _make_divisible(value: int, divisor: int) -> int: + min_value = divisor + new_v = max(min_value, int(value + divisor / 2) // divisor * divisor) + # Make sure that round down does not go down by more than 10%. 
 
 
 class SpatialSepConvSF(nn.Module):
diff --git a/luxonis_train/nodes/backbones/mobileone/blocks.py b/luxonis_train/nodes/backbones/mobileone/blocks.py
index 4b926038..54017aa0 100644
--- a/luxonis_train/nodes/backbones/mobileone/blocks.py
+++ b/luxonis_train/nodes/backbones/mobileone/blocks.py
@@ -91,7 +91,7 @@ def __init__(
                     stride=self.stride,
                     padding=padding,
                     groups=self.groups,
-                    activation=nn.Identity(),
+                    activation=False,
                 )
             )
         self.rbr_conv: list[nn.Sequential] = nn.ModuleList(rbr_conv)  # type: ignore
@@ -106,7 +106,7 @@ def __init__(
                 stride=self.stride,
                 padding=0,
                 groups=self.groups,
-                activation=nn.Identity(),
+                activation=False,
             )
 
     def forward(self, inputs: Tensor) -> Tensor:
diff --git a/luxonis_train/nodes/backbones/rexnetv1.py b/luxonis_train/nodes/backbones/rexnetv1.py
index 6567586a..c34dbafe 100644
--- a/luxonis_train/nodes/backbones/rexnetv1.py
+++ b/luxonis_train/nodes/backbones/rexnetv1.py
@@ -202,7 +202,7 @@ def __init__(
                 in_channels=dw_channels,
                 out_channels=channels,
                 kernel_size=1,
-                activation=nn.Identity(),
+                activation=False,
             )
         )
 
diff --git a/luxonis_train/nodes/base_node.py b/luxonis_train/nodes/base_node.py
index 0a9a208a..409a9fe3 100644
--- a/luxonis_train/nodes/base_node.py
+++ b/luxonis_train/nodes/base_node.py
@@ -9,7 +9,7 @@
 from torch import Size, Tensor, nn
 from typeguard import TypeCheckError, check_type
 
-from luxonis_train.enums import TaskType
+from luxonis_train.enums import Metadata, Task, TaskType
 from luxonis_train.utils import (
     AttachIndexType,
     DatasetMetadata,
@@ -107,7 +107,7 @@ def wrap(output: Tensor) -> Packet[Tensor]:
     """
 
     attach_index: AttachIndexType
-    tasks: list[TaskType] | None = None
+    tasks: list[Task] | None = None
 
     def __init__(
         self,
@@ -122,6 +122,7 @@ def __init__(
         export_output_names: list[str] | None = None,
         attach_index: AttachIndexType | None = None,
         task_name: str | None = None,
+        metadata_task_override: str | dict[str, str] | None = None,
     ):
         """Constructor for the C{BaseNode}.
 
@@ -176,6 +177,35 @@ def __init__(
                 f"argument for node '{self.name}' was not provided."
             )
         self.task_name = task_name or ""
+        self.metadata_task_override = {}
+        if metadata_task_override is not None:
+            if self.tasks is None:
+                raise ValueError(
+                    "Metadata task override can only be used with nodes that define tasks."
+                )
+            n_metadata_tasks = sum(
+                1 for task in self.tasks if isinstance(task, Metadata)
+            )
+            if n_metadata_tasks > 1 and isinstance(
+                metadata_task_override, str
+            ):
+                raise ValueError(
+                    f"Node '{self.name}' defines multiple metadata tasks, "
+                    "but only a single task name was provided for "
+                    "`metadata_task_override`. Provide a dictionary "
+                    "mapping default names to new names instead."
+                )
+            for task in self.tasks:
+                if not isinstance(task, Metadata):
+                    continue
+
+                if isinstance(metadata_task_override, dict):
+                    new_name = metadata_task_override.get(task.name, task.name)
+                else:
+                    new_name = metadata_task_override
+
+                self.metadata_task_override[task.name] = new_name
+                task.name = new_name
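+                # Example (illustrative): with tasks = [Metadata("id")] and
+                # metadata_task_override="color", the "id" metadata task is
+                # renamed so labels are read from the "color" metadata field.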
 
         if getattr(self, "attach_index", None) is None:
             parameters = inspect.signature(self.forward).parameters
diff --git a/luxonis_train/nodes/blocks/blocks.py b/luxonis_train/nodes/blocks/blocks.py
index fa9912a8..dfacfeab 100644
--- a/luxonis_train/nodes/blocks/blocks.py
+++ b/luxonis_train/nodes/blocks/blocks.py
@@ -165,13 +165,13 @@ def __init__(
         self,
         in_channels: int,
         out_channels: int,
-        kernel_size: int,
-        stride: int = 1,
-        padding: int = 0,
-        dilation: int = 1,
+        kernel_size: int | tuple[int, int],
+        stride: int | tuple[int, int] = 1,
+        padding: int | tuple[int, int] = 0,
+        dilation: int | tuple[int, int] = 1,
         groups: int = 1,
         bias: bool = False,
-        activation: nn.Module | None = None,
+        activation: nn.Module | None | Literal[False] = None,
         use_norm: bool = True,
     ):
         """Conv2d + Optional BN + Activation.
@@ -192,12 +192,13 @@ def __init__(
         @param groups: Groups. Defaults to 1.
         @type bias: bool
         @param bias: Whether to use bias. Defaults to False.
-        @type activation: L{nn.Module} | None
-        @param activation: Activation function. If None then nn.ReLU.
+        @type activation: L{nn.Module} | None | Literal[False]
+        @param activation: Activation function. If None, C{nn.ReLU} is
+            used; if False, no activation is applied. Defaults to None.
         @type use_norm: bool
         @param use_norm: Whether to use normalization. Defaults to True.
         """
-        super().__init__(
+        blocks: list[nn.Module] = [
             nn.Conv2d(
                 in_channels,
                 out_channels,
@@ -208,9 +209,15 @@ def __init__(
                 groups,
                 bias,
             ),
-            nn.BatchNorm2d(out_channels) if use_norm else nn.Identity(),
-            activation or nn.ReLU(),
-        )
+        ]
+
+        if use_norm:
+            blocks.append(nn.BatchNorm2d(out_channels))
+
+        if activation is not False:
+            blocks.append(activation or nn.ReLU())
+
+        super().__init__(*blocks)
 
 
 class DWConvModule(ConvModule):
@@ -443,7 +450,7 @@ def __init__(
             stride=stride,
             padding=padding,
             groups=groups,
-            activation=nn.Identity(),
+            activation=False,
         )
         self.rbr_1x1 = ConvModule(
             in_channels=in_channels,
@@ -452,7 +459,7 @@ def __init__(
             stride=stride,
             padding=padding_11,
             groups=groups,
-            activation=nn.Identity(),
+            activation=False,
         )
 
     def forward(self, x: Tensor) -> Tensor:
@@ -728,7 +735,7 @@ def __init__(self, in_channels: int, out_channels: int):
                 in_channels=out_channels,
                 out_channels=out_channels,
                 kernel_size=1,
-                activation=nn.Identity(),
+                activation=False,
             ),
             nn.Sigmoid(),
         )
@@ -768,7 +775,7 @@ def __init__(
                 in_channels=out_channels,
                 out_channels=out_channels // reduction,
                 kernel_size=1,
-                activation=nn.Identity(),
+                activation=False,
             ),
             nn.Sigmoid(),
         )
diff --git a/luxonis_train/nodes/heads/__init__.py b/luxonis_train/nodes/heads/__init__.py
index 6ebcf816..96843207 100644
--- a/luxonis_train/nodes/heads/__init__.py
+++ b/luxonis_train/nodes/heads/__init__.py
@@ -6,6 +6,7 @@
 from .efficient_bbox_head import EfficientBBoxHead
 from .efficient_keypoint_bbox_head import EfficientKeypointBBoxHead
 from .fomo_head import FOMOHead
+from .ghostfacenet_head import GhostFaceNetHead
 from .precision_bbox_head import PrecisionBBoxHead
 from .precision_seg_bbox_head import PrecisionSegmentBBoxHead
 from .segmentation_head import SegmentationHead
@@ -19,6 +20,7 @@
     "SegmentationHead",
     "DDRNetSegmentationHead",
     "DiscSubNetHead",
+    "GhostFaceNetHead",
     "FOMOHead",
     "PrecisionBBoxHead",
"PrecisionSegmentBBoxHead", diff --git a/luxonis_train/nodes/heads/ghostfacenet_head.py b/luxonis_train/nodes/heads/ghostfacenet_head.py new file mode 100644 index 00000000..4b7dcb06 --- /dev/null +++ b/luxonis_train/nodes/heads/ghostfacenet_head.py @@ -0,0 +1,82 @@ +# Original source: https://github.com/Hazqeel09/ellzaf_ml/blob/main/ellzaf_ml/models/ghostfacenetsv2.py +import math + +import torch.nn as nn +from torch import Tensor + +from luxonis_train.enums import Metadata +from luxonis_train.nodes.base_node import BaseNode +from luxonis_train.nodes.blocks.blocks import ConvModule + + +class GhostFaceNetHead(BaseNode[Tensor, list[Tensor]]): + in_channels: int + in_width: int + tasks = [Metadata("id")] + + def __init__( + self, + embedding_size: int = 512, + cross_batch_memory_size: int | None = None, + dropout: float = 0.2, + **kwargs, + ): + """GhostFaceNetV2 backbone. + + GhostFaceNetV2 is a convolutional neural network architecture focused on face recognition, but it is + adaptable to generic embedding tasks. It is based on the GhostNet architecture and uses Ghost BottleneckV2 blocks. + + Source: U{https://github.com/Hazqeel09/ellzaf_ml/blob/main/ellzaf_ml/models/ghostfacenetsv2.py} + + @license: U{MIT License + } + + @see: U{GhostFaceNets: Lightweight Face Recognition Model From Cheap Operations + } + + @type embedding_size: int + @param embedding_size: Size of the embedding. Defaults to 512. + @type cross_batch_memory_size: int | None + @param cross_batch_memory_size: Size of the cross-batch memory. Defaults to None. + @type dropout: float + @param dropout: Dropout rate. Defaults to 0.2. + """ + super().__init__(**kwargs) + self.embedding_size = embedding_size + self.cross_batch_memory_size = cross_batch_memory_size + _, H, W = self.original_in_shape + + self.head = nn.Sequential( + ConvModule( + self.in_channels, + self.in_channels, + kernel_size=( + H // 32 if H % 32 == 0 else H // 32 + 1, + W // 32 if W % 32 == 0 else W // 32 + 1, + ), + groups=self.in_channels, + activation=False, + ), + nn.Dropout(dropout), + nn.Conv2d( + self.in_channels, embedding_size, kernel_size=1, bias=False + ), + nn.Flatten(), + nn.BatchNorm1d(embedding_size), + ) + self._init_weights() + + def _init_weights(self): + for m in self.modules(): + if isinstance(m, nn.Conv2d) or isinstance(m, nn.Linear): + fan_in, _ = nn.init._calculate_fan_in_and_fan_out(m.weight) + negative_slope = 0.25 + m.weight.data.normal_( + 0, math.sqrt(2.0 / (fan_in * (1 + negative_slope**2))) + ) + if isinstance(m, nn.BatchNorm2d): + m.momentum = 0.9 + m.eps = 1e-5 + + def forward(self, x: Tensor) -> Tensor: + return self.head(x) diff --git a/luxonis_train/utils/dataset_metadata.py b/luxonis_train/utils/dataset_metadata.py index fdbec775..2c10905c 100644 --- a/luxonis_train/utils/dataset_metadata.py +++ b/luxonis_train/utils/dataset_metadata.py @@ -62,7 +62,7 @@ def n_classes(self, task: str | None = None) -> int: for classes in self._classes.values(): if len(classes) != n_classes: raise RuntimeError( - "The dataset contains different number of classes for different tasks." + "The dataset contains different number of classes for different tasks. " "Please specify the 'task' argument to get the number of classes." ) return n_classes @@ -90,7 +90,7 @@ def n_keypoints(self, task: str | None = None) -> int: for n in self._n_keypoints.values(): if n != n_keypoints: raise RuntimeError( - "The dataset contains different number of keypoints for different tasks." + "The dataset contains different number of keypoints for different tasks. 
" "Please specify the 'task' argument to get the number of keypoints." ) return n_keypoints diff --git a/requirements.txt b/requirements.txt index 5ef87b3a..bd4e663a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -18,3 +18,6 @@ mlflow>=2.10.0 psutil>=5.0.0 tabulate>=0.9.0 grad-cam>=1.5.4 +pytorch_metric_learning>=2.7.0 +scikit-learn>=1.5.0 +seaborn>=1.16.0 diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index ab2fb1e8..92fc8720 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -4,10 +4,12 @@ from pathlib import Path from typing import Any +import cv2 import gdown +import numpy as np import pytest import torchvision -from luxonis_ml.data import LuxonisDataset +from luxonis_ml.data import Category, LuxonisDataset from luxonis_ml.data.parsers import LuxonisParser from luxonis_ml.utils import environ @@ -38,13 +40,40 @@ def parking_lot_dataset() -> LuxonisDataset: url = "gs://luxonis-test-bucket/luxonis-ml-test-data/D1_ParkingLot_Native.zip" parser = LuxonisParser( url, - dataset_name="_D1_ParkingLot", + dataset_name="D1_ParkingLot", delete_existing=True, save_dir=WORK_DIR, ) return parser.parse(random_split=True) +@pytest.fixture(scope="session") +def embedding_dataset() -> LuxonisDataset: + img_dir = WORK_DIR / "embedding_images" + img_dir.mkdir(exist_ok=True) + + def generator(): + for i in range(100): + color = [(255, 0, 0), (0, 255, 0), (0, 0, 255)][i % 3] + img = np.full((100, 100, 3), color, dtype=np.uint8) + img[i, i] = 255 + cv2.imwrite(str(img_dir / f"image_{i}.png"), img) + + yield { + "file": img_dir / f"image_{i}.png", + "annotation": { + "metadata": { + "color": Category(["red", "green", "blue"][i % 3]), + }, + }, + } + + dataset = LuxonisDataset("embedding_test", delete_existing=True) + dataset.add(generator()) + dataset.make_splits() + return dataset + + @pytest.fixture(scope="session") def coco_dataset() -> LuxonisDataset: dataset_name = "coco_test" diff --git a/tests/integration/test_embeddings.py b/tests/integration/test_embeddings.py new file mode 100644 index 00000000..ea1a4868 --- /dev/null +++ b/tests/integration/test_embeddings.py @@ -0,0 +1,15 @@ +from luxonis_ml.data import LuxonisDataset + +from luxonis_train.core import LuxonisModel + + +def test_embeddings_model(embedding_dataset: LuxonisDataset): + model = LuxonisModel( + cfg="configs/embeddings_model.yaml", + opts={ + "loader.params.dataset_name": embedding_dataset.dataset_name, + "trainer.epochs": 1, + "trainer.validation_interval": 1, + }, + ) + model.train() diff --git a/tests/unittests/test_base_attached_module.py b/tests/unittests/test_base_attached_module.py index c7cd1508..450242b9 100644 --- a/tests/unittests/test_base_attached_module.py +++ b/tests/unittests/test_base_attached_module.py @@ -148,12 +148,12 @@ def test_prepare(inputs: Packet[Tensor], labels: Labels): det_head = DummyDetectionHead() assert seg_loss.prepare(inputs, labels) == ( - SEGMENTATION_ARRAY, + [SEGMENTATION_ARRAY], SEGMENTATION_ARRAY, ) inputs["/segmentation"].append(FEATURES_ARRAY) assert seg_loss.prepare(inputs, labels) == ( - FEATURES_ARRAY, + [SEGMENTATION_ARRAY, FEATURES_ARRAY], SEGMENTATION_ARRAY, ) diff --git a/tests/unittests/test_callbacks/test_ema.py b/tests/unittests/test_callbacks/test_ema.py index e9d2db7f..51fc2505 100644 --- a/tests/unittests/test_callbacks/test_ema.py +++ b/tests/unittests/test_callbacks/test_ema.py @@ -9,7 +9,7 @@ class SimpleModel(LightningModule): def __init__(self): - super(SimpleModel, self).__init__() + super().__init__() 
         self.layer = torch.nn.Linear(2, 2)
 
     def forward(self, x):