Skip to content

Commit

Permalink
chg ! use chunks
Browse files Browse the repository at this point in the history
  • Loading branch information
vitali-yanushchyk-valor committed Dec 27, 2024
1 parent ca3a76e commit 54a9642
Show file tree
Hide file tree
Showing 12 changed files with 388 additions and 243 deletions.
2 changes: 2 additions & 0 deletions src/hope_dedup_engine/apps/api/admin/finding.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,14 @@ class FindingAdmin(AdminFiltersMixin, ModelAdmin):
"id",
"deduplication_set",
"score",
"error",
"first_reference_pk",
"second_reference_pk",
)
list_filter = (
("deduplication_set", AutoCompleteFilter),
("score", NumberFilter),
("error", NumberFilter),
DjangoLookupFilter,
)

Expand Down
67 changes: 32 additions & 35 deletions src/hope_dedup_engine/apps/api/deduplication/adapters.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,7 @@
from collections.abc import Callable, Generator

from constance import config

from hope_dedup_engine.apps.api.deduplication.registry import DuplicateKeyPair
from hope_dedup_engine.apps.api.models import DeduplicationSet
from hope_dedup_engine.apps.faces.services import FacialDetector
from hope_dedup_engine.constants import is_facial_error


class DuplicateFaceFinder:
Expand All @@ -18,34 +14,35 @@ def __init__(self, deduplication_set: DeduplicationSet):
def run(
    self, tracker: Callable[[int], None] | None = None
) -> Generator[DuplicateKeyPair, None, None]:
    """Yield duplicate key pairs found for the deduplication set.

    The in-process facial finder is switched off in this revision: its
    previous implementation is parked below as comments, so this generator
    currently yields nothing.

    Args:
        tracker: optional callback taking an int (presumably a progress
            reporter -- confirm against the caller); currently unused.

    Yields:
        Nothing at the moment. The declared item type stays
        ``DuplicateKeyPair`` so the signature is unchanged for callers.
    """
    # A bare ``...`` body would make this a plain function returning None,
    # which breaks any ``for pair in finder.run(...)`` loop. An explicit
    # empty generator keeps the annotated contract intact.
    yield from ()

    # Legacy implementation, kept for reference. It relies on
    # ``constance.config``, ``FacialDetector`` and ``is_facial_error``,
    # which are no longer imported by this module.
    #
    # filename_to_reference_pk = {
    #     filename: reference_pk
    #     for reference_pk, filename in self.deduplication_set.image_set.values_list(
    #         "reference_pk", "filename"
    #     )
    # }
    # options = {
    #     "detector_backend": config.FACE_DETECTOR_MODEL,
    #     "model_name": config.FACIAL_RECOGNITION_MODEL,
    # }
    # # options = ConfigDefaults()
    # # if self.deduplication_set.config:
    # #     options.apply_config_overrides(self.deduplication_set.config.settings)
    # # ignored key pairs are not handled correctly in DuplicationDetector
    # detector = FacialDetector(
    #     self.deduplication_set.pk,
    #     tuple[str](filename_to_reference_pk.keys()),
    #     options=options,
    # )
    # for first_filename, second_filename, distance in detector.find_duplicates(
    #     # tracker
    # ):
    #     yield (
    #         filename_to_reference_pk[first_filename],
    #         (
    #             filename_to_reference_pk[second_filename]
    #             if second_filename in filename_to_reference_pk
    #             else second_filename
    #         ),
    #         distance if is_facial_error(distance) else (1 - distance),
    #     )
45 changes: 37 additions & 8 deletions src/hope_dedup_engine/apps/api/deduplication/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,18 @@

from django.db.models import F

from celery import shared_task
from celery import chord, shared_task
from constance import config as constance_config

from hope_dedup_engine.apps.api.deduplication.registry import ( # DuplicateFinder,; DuplicateKeyPair,
get_finders,
)
from hope_dedup_engine.apps.api.models import DedupJob, DeduplicationSet, Finding
from hope_dedup_engine.apps.api.utils.notification import send_notification
from hope_dedup_engine.apps.api.utils.progress import track_progress_multi
from hope_dedup_engine.apps.faces.celery_tasks import callback_encodings, encode_chunk
from hope_dedup_engine.apps.faces.services.facial import get_chunks
from hope_dedup_engine.constants import CHUNK_SIZE

# def _sort_keys(pair: DuplicateKeyPair) -> DuplicateKeyPair:
# first, second, score = pair
Expand Down Expand Up @@ -68,6 +72,14 @@ def update_job_progress(job: DedupJob, progress: int) -> None:
job.progress = progress
job.save()

# config = {"options": {**depface_options}, "path": path,
# "dedupe_threshold": dedupe_threshold,
# "report_threshold": report_threshold,
# "symmetric": symmetric,
# "edges": edges,
# "reset": reset}
# process_dataset.delay(config)


@shared_task(soft_time_limit=0.5 * HOUR, time_limit=1 * HOUR)
def find_duplicates(dedup_job_id: int, version: int) -> None:
Expand All @@ -79,7 +91,18 @@ def find_duplicates(dedup_job_id: int, version: int) -> None:
deduplication_set.save()
send_notification(deduplication_set.notification_url)

config = {
"deduplication_set_id": deduplication_set.pk,
"dedupe_threshold": constance_config.FACE_DISTANCE_THRESHOLD,
# "reset": True,
"options": {
"detector_backend": constance_config.FACE_DETECTOR_MODEL,
"model_name": constance_config.FACIAL_RECOGNITION_MODEL,
},
}

# clean results
# if config["reset"]:
Finding.objects.filter(deduplication_set=deduplication_set).delete()

weight_total = 0
Expand All @@ -93,13 +116,19 @@ def find_duplicates(dedup_job_id: int, version: int) -> None:

deduplication_set.finding_set.update(score=F("score") / weight_total)

for finder, tracker in zip(
get_finders(deduplication_set),
track_progress_multi(partial(update_job_progress, dedup_job)),
):
for first, second, score in finder.run(tracker):
finding = (first, second, score * finder.weight)
deduplication_set.update_findings(finding)
files = deduplication_set.image_set.values_list("filename", flat=True)

chunks = get_chunks(files, (len(files) // CHUNK_SIZE) + 1)
tasks = [encode_chunk.s(chunk, config) for n, chunk in enumerate(chunks)]
chord(tasks)(callback_encodings.s(config=config))

# for finder, tracker in zip(
# get_finders(deduplication_set),
# track_progress_multi(partial(update_job_progress, dedup_job)),
# ):
# for first, second, score in finder.run(tracker):
# finding = (first, second, score * finder.weight)
# deduplication_set.update_findings(finding)

deduplication_set.state = deduplication_set.State.CLEAN
deduplication_set.save(update_fields=["state"])
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Generated by Django 5.1.4 on 2024-12-24 06:31
# Generated by Django 5.1.4 on 2024-12-27 05:37

import django.db.models.deletion
from django.db import migrations, models
Expand Down Expand Up @@ -63,6 +63,7 @@ class Migration(migrations.Migration):
models.CharField(max_length=100, verbose_name="Second reference"),
),
("score", models.FloatField(default=0)),
("error", models.IntegerField(blank=True, null=True)),
(
"deduplication_set",
models.ForeignKey(
Expand Down
40 changes: 26 additions & 14 deletions src/hope_dedup_engine/apps/api/models/deduplication.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
from django.conf import settings
from django.db import models

from constance import config as constance_config

from hope_dedup_engine.apps.faces.managers import StorageManager
from hope_dedup_engine.apps.security.models import ExternalSystem
from hope_dedup_engine.types import EncodingType, FindingType, SilencedType
Expand Down Expand Up @@ -110,12 +112,19 @@ def update_encodings(self, encodings: EncodingType) -> None:
self.encodings.update(encodings)
self.save()

def update_findings(self, findings: FindingType) -> None:
    """Store a batch of findings for this deduplication set.

    Args:
        findings: iterable of indexable records laid out as
            ``(first_reference_pk, second_reference_pk, score, error)``.

    All rows are written with a single ``bulk_create`` call.
    ``ignore_conflicts=True`` is passed so that rows the database rejects
    as conflicting are skipped instead of aborting the whole batch.
    """
    Finding.objects.bulk_create(
        [
            Finding(
                deduplication_set=self,
                first_reference_pk=f[0],
                second_reference_pk=f[1],
                score=f[2],
                error=f[3],
            )
            for f in findings
        ],
        ignore_conflicts=True,
    )

def update_silenced(self, silenced: SilencedType) -> None:
Expand All @@ -125,28 +134,31 @@ def update_silenced(self, silenced: SilencedType) -> None:
def get_files(self) -> list[str]:
    """Retrieve the URLs of all valid image files.

    A file is returned only when both conditions hold:

    * its name is registered in ``self.image_set`` (``filename`` column), and
    * its name matches one of the accepted image patterns
      (``*.png``, ``*.jpg``, ``*.jpeg``).

    Returns:
        URLs produced by the ``"images"`` storage, in the order the storage
        lists the files.
    """
    patterns = ("*.png", "*.jpg", "*.jpeg")
    # Build the set once so each membership test below is O(1).
    db_filenames = set(self.image_set.values_list("filename", flat=True))
    st_images = self.storages.get_storage("images")

    # listdir("") returns a pair; index 1 is used as the list of file names
    # (the file half of Django's storage ``listdir`` result).
    return [
        st_images.url(file)
        for file in st_images.listdir("")[1]
        if file in db_filenames
        and any(fnmatch(file, pattern) for pattern in patterns)
    ]

def get_encoding_config(self) -> dict[str, str | int | float | bool]:
    """Return the options used when encoding faces.

    Both values are read from the constance configuration
    (``FACE_DETECTOR_MODEL`` and ``FACIAL_RECOGNITION_MODEL``), not from
    per-set options.
    """
    return {
        "detector_backend": constance_config.FACE_DETECTOR_MODEL,
        "model_name": constance_config.FACIAL_RECOGNITION_MODEL,
    }

def get_dedupe_config(self) -> dict[str, str | int | float | bool]:
    """Return the options used when comparing encodings for duplicates.

    The distance threshold, detector backend and recognition model are all
    read from the constance configuration (``FACE_DISTANCE_THRESHOLD``,
    ``FACE_DETECTOR_MODEL``, ``FACIAL_RECOGNITION_MODEL``).
    """
    return {
        "threshold": constance_config.FACE_DISTANCE_THRESHOLD,
        "detector_backend": constance_config.FACE_DETECTOR_MODEL,
        "model_name": constance_config.FACIAL_RECOGNITION_MODEL,
    }

def get_dedupe_threshold(self):
    """Return the face distance threshold from the constance configuration.

    The value is ``constance_config.FACE_DISTANCE_THRESHOLD``; the per-set
    ``self.config["dedupe_threshold"]`` lookup is no longer consulted.
    """
    return constance_config.FACE_DISTANCE_THRESHOLD


class Image(models.Model):
Expand Down Expand Up @@ -186,7 +198,7 @@ class Finding(models.Model):
second_reference_pk = models.CharField(
max_length=REFERENCE_PK_LENGTH, verbose_name="Second reference"
)
score = models.FloatField(default=0, validators=[])
score = models.FloatField(default=0, validators=[], verbose_name="Similarity Score")
error = models.IntegerField(null=True, blank=True)

class Meta:
Expand Down
Loading

0 comments on commit 54a9642

Please sign in to comment.