unicef · domdinicola · Oct 31, 2024 · Oct 25, 2024 · Oct 25, 2024 · Oct 29, 2024
diff --git a/src/hope_dedup_engine/apps/api/admin/duplicate.py b/src/hope_dedup_engine/apps/api/admin/duplicate.py
@@ -1,10 +1,7 @@
 from django.contrib.admin import ModelAdmin, register
 
-from adminfilters.filters import (
-    DjangoLookupFilter,
-    NumberFilter,
-    RelatedFieldComboFilter,
-)
+from adminfilters.autocomplete import AutoCompleteFilter
+from adminfilters.filters import DjangoLookupFilter, NumberFilter
 from adminfilters.mixin import AdminFiltersMixin
 
 from hope_dedup_engine.apps.api.models import Duplicate
@@ -20,7 +17,7 @@ class DuplicateAdmin(AdminFiltersMixin, ModelAdmin):
         "second_reference_pk",
     )
     list_filter = (
-        ("deduplication_set", RelatedFieldComboFilter),
+        ("deduplication_set", AutoCompleteFilter),
         ("score", NumberFilter),
         DjangoLookupFilter,
     )

diff --git a/src/hope_dedup_engine/apps/api/admin/image.py b/src/hope_dedup_engine/apps/api/admin/image.py
@@ -1,7 +1,8 @@
 from django.contrib.admin import ModelAdmin, register
 
+from adminfilters.autocomplete import AutoCompleteFilter
 from adminfilters.dates import DateRangeFilter
-from adminfilters.filters import DjangoLookupFilter, RelatedFieldComboFilter
+from adminfilters.filters import DjangoLookupFilter
 from adminfilters.mixin import AdminFiltersMixin
 
 from hope_dedup_engine.apps.api.models import Image
@@ -17,7 +18,7 @@ class ImageAdmin(AdminFiltersMixin, ModelAdmin):
     )
 
     list_filter = (
-        ("deduplication_set", RelatedFieldComboFilter),
+        ("deduplication_set", AutoCompleteFilter),
         ("created_at", DateRangeFilter),
         DjangoLookupFilter,
     )
@@ -27,6 +28,3 @@ def has_add_permission(self, request):
 
     def has_change_permission(self, request, obj=None):
         return False
-
-    def has_delete_permission(self, request, obj=None):
-        return obj is not None
diff --git a/src/hope_dedup_engine/apps/api/deduplication/process.py b/src/hope_dedup_engine/apps/api/deduplication/process.py
@@ -94,10 +94,11 @@ def find_duplicates(deduplication_set_id: str, serialized_lock: str) -> None:
         deduplication_set.state = deduplication_set.State.CLEAN
         deduplication_set.save()
 
-        if lock_enabled:
-            lock.release()
-
     except Exception:
         deduplication_set.state = DeduplicationSet.State.ERROR
         deduplication_set.save()
         raise
+
+    finally:
+        if lock_enabled:
+            lock.release()
diff --git a/src/hope_dedup_engine/apps/api/models/deduplication.py b/src/hope_dedup_engine/apps/api/models/deduplication.py
@@ -18,7 +18,7 @@ class Config(models.Model):
     )
 
     def __str__(self) -> str:
-        return " | ".join(
+        return f"{self.pk}: " + " | ".join(
             f"{field.name}: {getattr(self, field.name)}"
             for field in self._meta.fields
             if field.name not in ("id",)

diff --git a/src/hope_dedup_engine/apps/core/exceptions.py b/src/hope_dedup_engine/apps/core/exceptions.py
@@ -16,3 +16,12 @@ class DownloaderKeyError(Exception):
     def __init__(self, key: str) -> None:
         self.key = key
         super().__init__(f"Downloader key '{key}' does not exist.")
+
+
+class NotCompliantImageError(Exception):
+    """
+    Raised when an image is not compliant with the expected parameters (too small, or no face regions detected).
+    """
+
+    def __init__(self, message: str) -> None:
+        super().__init__(message)
diff --git a/src/hope_dedup_engine/apps/faces/services/duplication_detector.py b/src/hope_dedup_engine/apps/faces/services/duplication_detector.py
@@ -125,22 +125,24 @@ def find_duplicates(self) -> Generator[tuple[str, str, float], None, None]:
             encodings_all = self._load_encodings_all()
 
             for path1, path2 in combinations(existed_images_name, 2):
-                min_distance = self.face_distance_threshold
                 encodings1 = encodings_all.get(path1)
                 encodings2 = encodings_all.get(path2)
                 if encodings1 is None or encodings2 is None:
                     continue
 
+                min_distance = None
                 for encoding1 in encodings1:
-                    if (
-                        current_min := min(
-                            face_recognition.face_distance(encodings2, encoding1)
-                        )
-                    ) < min_distance:
+                    distances = face_recognition.face_distance(encodings2, encoding1)
+                    current_min = min(distances) if np.any(distances) else 0
+                    if min_distance is None or current_min < min_distance:
                         min_distance = current_min
 
-                if min_distance < self.face_distance_threshold:
+                if (
+                    min_distance is not None
+                    and min_distance < self.face_distance_threshold
+                ):
                     yield (path1, path2, round(min_distance, 5))
+
         except Exception as e:
             self.logger.exception(
                 "Error finding duplicates for images %s", self.filenames

diff --git a/src/hope_dedup_engine/apps/faces/services/image_processor.py b/src/hope_dedup_engine/apps/faces/services/image_processor.py
@@ -11,6 +11,7 @@
 import numpy as np
 from constance import config
 
+from hope_dedup_engine.apps.core.exceptions import NotCompliantImageError
 from hope_dedup_engine.apps.faces.managers import DNNInferenceManager, StorageManager
 
 
@@ -97,19 +98,19 @@ def _get_face_detections_dnn(
                 # Decode image from binary buffer to 3D numpy array (height, width, channels of BlueGreeRed color space)
                 image = cv2.imdecode(img_array, cv2.IMREAD_COLOR)
             (h, w) = image.shape[:2]
+            _h, _w = (
+                self.blob_from_image_cfg.shape["height"],
+                self.blob_from_image_cfg.shape["width"],
+            )
+            if h < _h or w < _w:
+                raise NotCompliantImageError(
+                    f"Image {filename} too small: '{h}x{w}'. It needs to be at least '{_h}x{_w}'."
+                )
+
             # Create a blob (4D tensor) from the image
             blob = cv2.dnn.blobFromImage(
-                image=cv2.resize(
-                    image,
-                    dsize=(
-                        self.blob_from_image_cfg.shape["height"],
-                        self.blob_from_image_cfg.shape["width"],
-                    ),
-                ),
-                size=(
-                    self.blob_from_image_cfg.shape["height"],
-                    self.blob_from_image_cfg.shape["width"],
-                ),
+                image=cv2.resize(image, dsize=(_h, _w)),
+                size=(_h, _w),
                 scalefactor=self.blob_from_image_cfg.scale_factor,
                 mean=self.blob_from_image_cfg.mean_values,
             )
@@ -147,6 +148,30 @@ def _get_face_detections_dnn(
             raise e
         return face_regions
 
+    def _preprocess_image(self, filename: str) -> np.ndarray:
+        """
+        Load an image from the 'images' storage and equalize its luminance histogram.
+
+        The file is read as an array of bytes and decoded into a color image. Its color space is converted
+        from BGR (Blue, Green, Red) to YUV, histogram equalization is applied to the Y channel (luminance)
+        to enhance contrast, and the result is converted to RGB color space for further processing.
+
+        NOTE(review): the decode result is not checked; cv2.imdecode presumably yields None on bad data - confirm.
+
+        Args:
+            filename (str): The filename of the image to preprocess.
+
+        Returns:
+            np.ndarray: The preprocessed image as a NumPy array in RGB format.
+        """
+        with self.storages.get_storage("images").open(filename, "rb") as img_file:
+            img_array = np.asarray(bytearray(img_file.read()), dtype=np.uint8)
+        image = cv2.cvtColor(
+            cv2.imdecode(img_array, cv2.IMREAD_COLOR), cv2.COLOR_BGR2YUV
+        )
+        image[:, :, 0] = cv2.equalizeHist(image[:, :, 0])
+        return cv2.cvtColor(image, cv2.COLOR_YUV2RGB)
+
     def encode_face(self, filename: str, encodings_filename: str) -> None:
         """
         Encode faces detected in an image and save the encodings to storage.
@@ -156,19 +181,20 @@ def encode_face(self, filename: str, encodings_filename: str) -> None:
             encodings_filename (str): The filename to save the face encodings.
         """
         try:
-            with self.storages.get_storage("images").open(filename, "rb") as img_file:
-                image = face_recognition.load_image_file(img_file)
             encodings: list[np.ndarray[np.float32, Any]] = []
+            image = self._preprocess_image(filename)
             face_regions = self._get_face_detections_dnn(filename)
             if not face_regions:
-                self.logger.warning("No face regions detected in image %s", filename)
+                raise NotCompliantImageError(
+                    f"No face regions detected in image '{filename}'."
+                )
             else:
                 for region in face_regions:
                     if isinstance(region, (list, tuple)) and len(region) == 4:
                         top, right, bottom, left = region
                         face_encodings = face_recognition.face_encodings(
                             image,
-                            [(top, right, bottom, left)],
+                            [(right, bottom, left, top)],
                             num_jitters=self.face_encodings_cfg.num_jitters,
                             model=self.face_encodings_cfg.model,
                         )

diff --git a/src/hope_dedup_engine/config/fragments/constance.py b/src/hope_dedup_engine/config/fragments/constance.py
@@ -40,7 +40,7 @@
         "tuple_field",
     ),
     "FACE_DETECTION_CONFIDENCE": (
-        0.5,
+        0.7,
         """
         Specifies the minimum confidence score required for a detected face to be considered valid. Detections
         with confidence scores below this threshold are discarded as likely false positives.
@@ -67,7 +67,7 @@
         int,
     ),
     "FACE_ENCODINGS_MODEL": (
-        "small",
+        "large",
         """
         Specifies the model type used for encoding face landmarks. It can be either 'small' which is faster and
         detects only 5 key facial landmarks, or 'large' which is more precise and identifies 68 key facial landmarks
@@ -76,11 +76,12 @@
         "face_encodings_model",
     ),
     "FACE_DISTANCE_THRESHOLD": (
-        0.4,
+        0.26,
         """
-        Specifies the maximum allowable distance between two face embeddings for them to be considered a match. It helps
-        determine if two faces belong to the same person by setting a threshold for similarity. Lower values result in
-        stricter matching, while higher values allow for more lenient matches.
+        Specifies the maximum allowable distance between two face embeddings for them to be considered a match.
+        This tolerance threshold is crucial for assessing whether two faces belong to the same individual,
+        as it establishes the similarity limit. Lower values result in stricter matching, while higher values allow
+        for more lenient matches.
         """,
         float,
     ),

diff --git a/tests/extras/demoapp/demo_images/Aaron_Eckhart_0001.jpg b/tests/extras/demoapp/demo_images/Aaron_Eckhart_0001.jpg
diff --git a/tests/extras/demoapp/demo_images/Aaron_Guiel_0001.jpg b/tests/extras/demoapp/demo_images/Aaron_Guiel_0001.jpg
diff --git a/tests/extras/demoapp/demo_images/Aaron_Peirsol_0001.jpg b/tests/extras/demoapp/demo_images/Aaron_Peirsol_0001.jpg
diff --git a/tests/extras/demoapp/demo_images/Aaron_Peirsol_0002.jpg b/tests/extras/demoapp/demo_images/Aaron_Peirsol_0002.jpg
diff --git a/tests/extras/demoapp/demo_images/Cathy_Freeman_0001.jpg b/tests/extras/demoapp/demo_images/Cathy_Freeman_0001.jpg
diff --git a/tests/extras/demoapp/demo_images/Cathy_Freeman_0002.jpg b/tests/extras/demoapp/demo_images/Cathy_Freeman_0002.jpg
diff --git a/tests/extras/demoapp/demo_images/Ziwang_Xu_0001.jpg b/tests/extras/demoapp/demo_images/Ziwang_Xu_0001.jpg
diff --git a/tests/extras/demoapp/demo_images/Zoe_Ball_0001.jpg b/tests/extras/demoapp/demo_images/Zoe_Ball_0001.jpg
diff --git a/tests/extras/demoapp/demo_images/too_small.jpg b/tests/extras/demoapp/demo_images/too_small.jpg
diff --git a/tests/extras/demoapp/demo_images/without_face.jpg b/tests/extras/demoapp/demo_images/without_face.jpg
diff --git a/tests/extras/demoapp/scripts/base_case b/tests/extras/demoapp/scripts/base_case
@@ -1,5 +1,7 @@
 #!/usr/bin/env bash
 
+start_time=$(date +%s)
+
 source "$(dirname "0")/.common"
 
 if [[ $# -ne 1 ]] ; then
@@ -9,16 +11,16 @@ fi
 
 ./create_deduplication_set "$1" | jq -r ".id" | xargs ./use_deduplication_set
 
-./create_image Aaron_Eckhart_0001.jpg
-./create_image Aaron_Guiel_0001.jpg
-./create_image Aaron_Peirsol_0001.jpg
-./create_image Aaron_Peirsol_0002.jpg
-./create_image Cathy_Freeman_0001.jpg
-./create_image Cathy_Freeman_0002.jpg
-./create_image without_face.jpg
-./create_image Ziwang_Xu_0001.jpg
-./create_image Zoe_Ball_0001.jpg
+# Upload every demo image; the -f test skips a glob pattern left unexpanded when nothing matches it.
+for file in ../demo_images/*.{jpg,png}; do
+  if [[ -f "$file" ]]; then
+    ./create_image "$(basename "$file")"
+  fi
+done
 
 ./process_deduplication_set
 
 ./show_duplicates
+
+duration=$(( $(date +%s) - start_time ))
+echo "Duration: $duration seconds."
diff --git a/tests/faces/conftest.py b/tests/faces/conftest.py
@@ -129,7 +129,7 @@ def mock_image_processor(
 @pytest.fixture
 def image_bytes_io():
     img_byte_arr = BytesIO()
-    image = Image.new("RGB", (100, 100), color="red")
+    image = Image.new("RGB", (300, 300), color="red")
     image.save(img_byte_arr, format="JPEG")
     img_byte_arr.seek(0)
     img_byte_arr.fake_open = lambda *_: BytesIO(img_byte_arr.getvalue())

diff --git a/tests/faces/faces_const.py b/tests/faces/faces_const.py
@@ -8,7 +8,7 @@
     ["ignore_file.jpg", "ignore_file2.jpg"],
     ["ignore_file4.jpg", "ignore_file3.jpg"],
 ]
-FACE_DISTANCE_THRESHOLD = 0.4
+FACE_DISTANCE_THRESHOLD = 0.26
 
 DNN_FILE = {
     "name": FILENAME,
@@ -44,8 +44,8 @@
 }
 FACE_REGIONS_INVALID: Final[list[list[tuple[int, int, int, int]]]] = [[], [(0, 0, 10)]]
 FACE_REGIONS_VALID: Final[list[tuple[int, int, int, int]]] = [
-    (10, 10, 20, 20),
-    (30, 30, 40, 40),
+    (40, 40, 80, 80),
+    (120, 120, 160, 160),
 ]
 BLOB_FROM_IMAGE_SCALE_FACTOR: Final[float] = 1.0
 BLOB_FROM_IMAGE_MEAN_VALUES: Final[tuple[float, float, float]] = (104.0, 177.0, 123.0)
@@ -56,8 +56,8 @@
     (0, 0, 0.15, 0.1, 0.1, 0.2, 0.2),  # with confidence 0.15 -> invalid detection
 ]
 IMAGE_SIZE: Final[tuple[int, int, int]] = (
-    100,
-    100,
+    400,
+    400,
     3,
 )  # Size of the image after decoding (h, w, number of channels)
 RESIZED_IMAGE_SIZE: Final[tuple[int, int, int]] = (

diff --git a/tests/faces/test_duplication_detector.py b/tests/faces/test_duplication_detector.py
@@ -205,12 +205,12 @@ def open_mock(filename, mode="rb"):
                 (
                     "test_file.jpg",
                     "test_file2.jpg",
-                    0.36,
-                ),  # config.FACE_DISTANCE_THRESHOLD + 0.04
+                    0.22,
+                ),  # config.FACE_DISTANCE_THRESHOLD - 0.04
                 (
                     "test_file.jpg",
                     "test_file3.jpg",
-                    0.2,
+                    0.06,
                 ),  # config.FACE_DISTANCE_THRESHOLD - 0.2
                 # last pair will not be included in the result because the distance is greater than the threshold
                 # ("test_file2.jpg", "test_file3.jpg", 0.44), # config.FACE_DISTANCE_THRESHOLD + 0.04

diff --git a/tests/faces/test_image_processor.py b/tests/faces/test_image_processor.py
@@ -131,15 +131,13 @@ def test_encode_face(mock_image_processor, image_bytes_io, face_regions):
         patch.object(
             mock_image_processor, "_get_face_detections_dnn", return_value=face_regions
         ) as mock_get_face_detections_dnn,
-        patch.object(face_recognition, "load_image_file") as mock_load_image_file,
         patch.object(face_recognition, "face_encodings") as mock_face_encodings,
     ):
         mock_image_processor.encode_face(FILENAME, FILENAME_ENCODED)
 
         mock_get_face_detections_dnn.assert_called_once()
         mocked_image_open.assert_called_with(FILENAME, "rb")
         assert mocked_image_open.side_effect == image_bytes_io.fake_open
-        mock_load_image_file.assert_called()
 
         if face_regions == FACE_REGIONS_VALID:
             mocked_encoded_open.assert_called_with(FILENAME_ENCODED, "wb")
@@ -152,10 +150,7 @@ def test_encode_face(mock_image_processor, image_bytes_io, face_regions):
 
 @pytest.mark.parametrize(
     "method, exception_str",
-    (
-        (str("load_image_file"), "Test load_image_file exception"),
-        (str("face_encodings"), "Test face_encodings exception"),
-    ),
+    ((str("face_encodings"), "Test face_encodings exception"),),
 )
 def test_encode_face_exception_handling(
     mock_image_processor, mock_net, method: str, exception_str