Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/finddup #105

Merged
merged 3 commits into from
Oct 31, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 3 additions & 6 deletions src/hope_dedup_engine/apps/api/admin/duplicate.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,7 @@
from django.contrib.admin import ModelAdmin, register

from adminfilters.filters import (
DjangoLookupFilter,
NumberFilter,
RelatedFieldComboFilter,
)
from adminfilters.autocomplete import AutoCompleteFilter
from adminfilters.filters import DjangoLookupFilter, NumberFilter
from adminfilters.mixin import AdminFiltersMixin

from hope_dedup_engine.apps.api.models import Duplicate
Expand All @@ -20,7 +17,7 @@ class DuplicateAdmin(AdminFiltersMixin, ModelAdmin):
"second_reference_pk",
)
list_filter = (
("deduplication_set", RelatedFieldComboFilter),
("deduplication_set", AutoCompleteFilter),
("score", NumberFilter),
DjangoLookupFilter,
)
Expand Down
8 changes: 3 additions & 5 deletions src/hope_dedup_engine/apps/api/admin/image.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
from django.contrib.admin import ModelAdmin, register

from adminfilters.autocomplete import AutoCompleteFilter
from adminfilters.dates import DateRangeFilter
from adminfilters.filters import DjangoLookupFilter, RelatedFieldComboFilter
from adminfilters.filters import DjangoLookupFilter
from adminfilters.mixin import AdminFiltersMixin

from hope_dedup_engine.apps.api.models import Image
Expand All @@ -17,7 +18,7 @@ class ImageAdmin(AdminFiltersMixin, ModelAdmin):
)

list_filter = (
("deduplication_set", RelatedFieldComboFilter),
("deduplication_set", AutoCompleteFilter),
("created_at", DateRangeFilter),
DjangoLookupFilter,
)
Expand All @@ -27,6 +28,3 @@ def has_add_permission(self, request):

def has_change_permission(self, request, obj=None):
return False

def has_delete_permission(self, request, obj=None):
return obj is not None
7 changes: 4 additions & 3 deletions src/hope_dedup_engine/apps/api/deduplication/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,10 +94,11 @@ def find_duplicates(deduplication_set_id: str, serialized_lock: str) -> None:
deduplication_set.state = deduplication_set.State.CLEAN
deduplication_set.save()

if lock_enabled:
lock.release()

except Exception:
deduplication_set.state = DeduplicationSet.State.ERROR
deduplication_set.save()
raise

finally:
if lock_enabled:
lock.release()
2 changes: 1 addition & 1 deletion src/hope_dedup_engine/apps/api/models/deduplication.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ class Config(models.Model):
)

def __str__(self) -> str:
return " | ".join(
return f"{self.pk}: " + " | ".join(
f"{field.name}: {getattr(self, field.name)}"
for field in self._meta.fields
if field.name not in ("id",)
Expand Down
9 changes: 9 additions & 0 deletions src/hope_dedup_engine/apps/core/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,12 @@ class DownloaderKeyError(Exception):
def __init__(self, key: str) -> None:
self.key = key
super().__init__(f"Downloader key '{key}' does not exist.")


class NotCompliantImageError(Exception):
"""
Exception raised when an image is not compliant with the expected parameters.
"""

def __init__(self, message: str) -> None:
super().__init__(message)
Original file line number Diff line number Diff line change
Expand Up @@ -125,22 +125,24 @@ def find_duplicates(self) -> Generator[tuple[str, str, float], None, None]:
encodings_all = self._load_encodings_all()

for path1, path2 in combinations(existed_images_name, 2):
min_distance = self.face_distance_threshold
encodings1 = encodings_all.get(path1)
encodings2 = encodings_all.get(path2)
if encodings1 is None or encodings2 is None:
continue

min_distance = None
for encoding1 in encodings1:
if (
current_min := min(
face_recognition.face_distance(encodings2, encoding1)
)
) < min_distance:
distances = face_recognition.face_distance(encodings2, encoding1)
current_min = min(distances) if np.any(distances) else 0
if min_distance is None or current_min < min_distance:
min_distance = current_min

if min_distance < self.face_distance_threshold:
if (
min_distance is not None
and min_distance < self.face_distance_threshold
):
yield (path1, path2, round(min_distance, 5))

except Exception as e:
self.logger.exception(
"Error finding duplicates for images %s", self.filenames
Expand Down
56 changes: 41 additions & 15 deletions src/hope_dedup_engine/apps/faces/services/image_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import numpy as np
from constance import config

from hope_dedup_engine.apps.core.exceptions import NotCompliantImageError
from hope_dedup_engine.apps.faces.managers import DNNInferenceManager, StorageManager


Expand Down Expand Up @@ -97,19 +98,19 @@ def _get_face_detections_dnn(
# Decode image from binary buffer to 3D numpy array (height, width, channels of BlueGreenRed color space)
image = cv2.imdecode(img_array, cv2.IMREAD_COLOR)
(h, w) = image.shape[:2]
_h, _w = (
self.blob_from_image_cfg.shape["height"],
self.blob_from_image_cfg.shape["width"],
)
if h < _h or w < _w:
raise NotCompliantImageError(
f"Image {filename} too small: '{h}x{w}'. It needs to be at least '{_h}x{_w}'."
)

# Create a blob (4D tensor) from the image
blob = cv2.dnn.blobFromImage(
image=cv2.resize(
image,
dsize=(
self.blob_from_image_cfg.shape["height"],
self.blob_from_image_cfg.shape["width"],
),
),
size=(
self.blob_from_image_cfg.shape["height"],
self.blob_from_image_cfg.shape["width"],
),
image=cv2.resize(image, dsize=(_h, _w)),
size=(_h, _w),
scalefactor=self.blob_from_image_cfg.scale_factor,
mean=self.blob_from_image_cfg.mean_values,
)
Expand Down Expand Up @@ -147,6 +148,30 @@ def _get_face_detections_dnn(
raise e
return face_regions

def _preprocess_image(self, filename: str) -> np.ndarray:
"""
This function retrieves an image from the 'images' storage, reads it as an array of bytes,
and decodes it into a color image.

The image's color space is first converted from BGR (Blue, Green, Red) to YUV. Histogram equalization is then
applied to the Y channel (luminance) to enhance the contrast of the image.

Finally, the image is converted to RGB color space for further processing.

Args:
filename (str): The filename of the image to preprocess.

Returns:
np.ndarray: The preprocessed image as a NumPy array in RGB format.
"""
with self.storages.get_storage("images").open(filename, "rb") as img_file:
img_array = np.asarray(bytearray(img_file.read()), dtype=np.uint8)
image = cv2.cvtColor(
cv2.imdecode(img_array, cv2.IMREAD_COLOR), cv2.COLOR_BGR2YUV
)
image[:, :, 0] = cv2.equalizeHist(image[:, :, 0])
return cv2.cvtColor(image, cv2.COLOR_YUV2RGB)

def encode_face(self, filename: str, encodings_filename: str) -> None:
"""
Encode faces detected in an image and save the encodings to storage.
Expand All @@ -156,19 +181,20 @@ def encode_face(self, filename: str, encodings_filename: str) -> None:
encodings_filename (str): The filename to save the face encodings.
"""
try:
with self.storages.get_storage("images").open(filename, "rb") as img_file:
image = face_recognition.load_image_file(img_file)
encodings: list[np.ndarray[np.float32, Any]] = []
image = self._preprocess_image(filename)
face_regions = self._get_face_detections_dnn(filename)
if not face_regions:
self.logger.warning("No face regions detected in image %s", filename)
raise NotCompliantImageError(
f"No face regions detected in image '{filename}'."
)
else:
for region in face_regions:
if isinstance(region, (list, tuple)) and len(region) == 4:
top, right, bottom, left = region
face_encodings = face_recognition.face_encodings(
image,
[(top, right, bottom, left)],
[(right, bottom, left, top)],
num_jitters=self.face_encodings_cfg.num_jitters,
model=self.face_encodings_cfg.model,
)
Expand Down
13 changes: 7 additions & 6 deletions src/hope_dedup_engine/config/fragments/constance.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@
"tuple_field",
),
"FACE_DETECTION_CONFIDENCE": (
0.5,
0.7,
"""
Specifies the minimum confidence score required for a detected face to be considered valid. Detections
with confidence scores below this threshold are discarded as likely false positives.
Expand All @@ -67,7 +67,7 @@
int,
),
"FACE_ENCODINGS_MODEL": (
"small",
"large",
"""
Specifies the model type used for encoding face landmarks. It can be either 'small' which is faster and
detects only 5 key facial landmarks, or 'large' which is more precise and identifies 68 key facial landmarks
Expand All @@ -76,11 +76,12 @@
"face_encodings_model",
),
"FACE_DISTANCE_THRESHOLD": (
0.4,
0.26,
"""
Specifies the maximum allowable distance between two face embeddings for them to be considered a match. It helps
determine if two faces belong to the same person by setting a threshold for similarity. Lower values result in
stricter matching, while higher values allow for more lenient matches.
Specifies the maximum allowable distance between two face embeddings for them to be considered a match.
This tolerance threshold is crucial for assessing whether two faces belong to the same individual,
as it establishes the similarity limit. Lower values result in stricter matching, while higher values allow
for more lenient matches.
""",
float,
),
Expand Down
Binary file modified tests/extras/demoapp/demo_images/Aaron_Eckhart_0001.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified tests/extras/demoapp/demo_images/Aaron_Guiel_0001.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified tests/extras/demoapp/demo_images/Aaron_Peirsol_0001.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified tests/extras/demoapp/demo_images/Aaron_Peirsol_0002.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified tests/extras/demoapp/demo_images/Cathy_Freeman_0001.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified tests/extras/demoapp/demo_images/Cathy_Freeman_0002.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified tests/extras/demoapp/demo_images/Ziwang_Xu_0001.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified tests/extras/demoapp/demo_images/Zoe_Ball_0001.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added tests/extras/demoapp/demo_images/too_small.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified tests/extras/demoapp/demo_images/without_face.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
20 changes: 11 additions & 9 deletions tests/extras/demoapp/scripts/base_case
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
#!/usr/bin/env bash

start_time=$(date +%s)

source "$(dirname "0")/.common"

if [[ $# -ne 1 ]] ; then
Expand All @@ -9,16 +11,16 @@ fi

./create_deduplication_set "$1" | jq -r ".id" | xargs ./use_deduplication_set

./create_image Aaron_Eckhart_0001.jpg
./create_image Aaron_Guiel_0001.jpg
./create_image Aaron_Peirsol_0001.jpg
./create_image Aaron_Peirsol_0002.jpg
./create_image Cathy_Freeman_0001.jpg
./create_image Cathy_Freeman_0002.jpg
./create_image without_face.jpg
./create_image Ziwang_Xu_0001.jpg
./create_image Zoe_Ball_0001.jpg
for file in ../demo_images/*.{jpg,png}; do
if [[ -f "$file" ]]; then
./create_image $(basename "$file")
fi
done


./process_deduplication_set

./show_duplicates

duration=$(( $(date +%s) - start_time ))
echo "Duration: $duration seconds."
2 changes: 1 addition & 1 deletion tests/faces/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ def mock_image_processor(
@pytest.fixture
def image_bytes_io():
img_byte_arr = BytesIO()
image = Image.new("RGB", (100, 100), color="red")
image = Image.new("RGB", (300, 300), color="red")
image.save(img_byte_arr, format="JPEG")
img_byte_arr.seek(0)
img_byte_arr.fake_open = lambda *_: BytesIO(img_byte_arr.getvalue())
Expand Down
10 changes: 5 additions & 5 deletions tests/faces/faces_const.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
["ignore_file.jpg", "ignore_file2.jpg"],
["ignore_file4.jpg", "ignore_file3.jpg"],
]
FACE_DISTANCE_THRESHOLD = 0.4
FACE_DISTANCE_THRESHOLD = 0.26

DNN_FILE = {
"name": FILENAME,
Expand Down Expand Up @@ -44,8 +44,8 @@
}
FACE_REGIONS_INVALID: Final[list[list[tuple[int, int, int, int]]]] = [[], [(0, 0, 10)]]
FACE_REGIONS_VALID: Final[list[tuple[int, int, int, int]]] = [
(10, 10, 20, 20),
(30, 30, 40, 40),
(40, 40, 80, 80),
(120, 120, 160, 160),
]
BLOB_FROM_IMAGE_SCALE_FACTOR: Final[float] = 1.0
BLOB_FROM_IMAGE_MEAN_VALUES: Final[tuple[float, float, float]] = (104.0, 177.0, 123.0)
Expand All @@ -56,8 +56,8 @@
(0, 0, 0.15, 0.1, 0.1, 0.2, 0.2), # with confidence 0.15 -> invalid detection
]
IMAGE_SIZE: Final[tuple[int, int, int]] = (
100,
100,
400,
400,
3,
) # Size of the image after decoding (h, w, number of channels)
RESIZED_IMAGE_SIZE: Final[tuple[int, int, int]] = (
Expand Down
6 changes: 3 additions & 3 deletions tests/faces/test_duplication_detector.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,12 +205,12 @@ def open_mock(filename, mode="rb"):
(
"test_file.jpg",
"test_file2.jpg",
0.36,
), # config.FACE_DISTANCE_THRESHOLD + 0.04
0.22,
), # config.FACE_DISTANCE_THRESHOLD - 0.04
(
"test_file.jpg",
"test_file3.jpg",
0.2,
0.06,
), # config.FACE_DISTANCE_THRESHOLD - 0.2
# last pair will not be included in the result because the distance is greater than the threshold
# ("test_file2.jpg", "test_file3.jpg", 0.3), # config.FACE_DISTANCE_THRESHOLD + 0.04
Expand Down
7 changes: 1 addition & 6 deletions tests/faces/test_image_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,15 +131,13 @@ def test_encode_face(mock_image_processor, image_bytes_io, face_regions):
patch.object(
mock_image_processor, "_get_face_detections_dnn", return_value=face_regions
) as mock_get_face_detections_dnn,
patch.object(face_recognition, "load_image_file") as mock_load_image_file,
patch.object(face_recognition, "face_encodings") as mock_face_encodings,
):
mock_image_processor.encode_face(FILENAME, FILENAME_ENCODED)

mock_get_face_detections_dnn.assert_called_once()
mocked_image_open.assert_called_with(FILENAME, "rb")
assert mocked_image_open.side_effect == image_bytes_io.fake_open
mock_load_image_file.assert_called()

if face_regions == FACE_REGIONS_VALID:
mocked_encoded_open.assert_called_with(FILENAME_ENCODED, "wb")
Expand All @@ -152,10 +150,7 @@ def test_encode_face(mock_image_processor, image_bytes_io, face_regions):

@pytest.mark.parametrize(
"method, exception_str",
(
(str("load_image_file"), "Test load_image_file exception"),
(str("face_encodings"), "Test face_encodings exception"),
),
((str("face_encodings"), "Test face_encodings exception"),),
)
def test_encode_face_exception_handling(
mock_image_processor, mock_net, method: str, exception_str
Expand Down
Loading