Skip to content

Commit

Permalink
chg ! save encodings, findings, silented in deduplicationset
Browse files Browse the repository at this point in the history
  • Loading branch information
vitali-yanushchyk-valor committed Dec 24, 2024
1 parent 004495d commit 27ab517
Show file tree
Hide file tree
Showing 16 changed files with 547 additions and 377 deletions.
2 changes: 1 addition & 1 deletion src/hope_dedup_engine/apps/api/admin/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from .config import ConfigAdmin # noqa
from .deduplicationset import DeduplicationSetAdmin # noqa
from .duplicate import DuplicateAdmin # noqa
from .finding import FindingAdmin # noqa
from .hdetoken import HDETokenAdmin # noqa
from .image import ImageAdmin # noqa
from .jobs import DedupJob # noqa
106 changes: 50 additions & 56 deletions src/hope_dedup_engine/apps/api/admin/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,22 +2,16 @@
from typing import Any

from django.contrib import messages
from django.contrib.admin import ModelAdmin, register, site
from django.core.exceptions import ValidationError
from django.contrib.admin import ModelAdmin, register
from django.db import models
from django.http import HttpRequest, HttpResponse
from django.shortcuts import redirect, render
from django.urls import path, reverse

from admin_extra_buttons.api import button
from admin_extra_buttons.mixins import ExtraButtonsMixin
from django_svelte_jsoneditor.widgets import SvelteJSONEditorWidget

from hope_dedup_engine.apps.api.forms import EditSchemaForm
from hope_dedup_engine.apps.api.models import Config
from hope_dedup_engine.apps.api.utils.shema_manager import SchemaManager
from hope_dedup_engine.apps.api.validators import DefaultValidatingValidator
from hope_dedup_engine.utils.security import is_root


@register(Config)
Expand All @@ -31,15 +25,15 @@ class ConfigAdmin(ExtraButtonsMixin, ModelAdmin):
}
}

def get_changeform_initial_data(self, request: HttpRequest) -> dict[str, str]:
    """Return the add-form initial data with ``settings`` seeded from the schema.

    ``settings`` starts as an empty dict and is handed to
    ``DefaultValidatingValidator`` built from the current schema. A
    ``ValidationError`` raised while loading the schema or validating is
    reported to the user as an error message instead of propagating; the
    initial data is returned either way.
    """
    defaults = super().get_changeform_initial_data(request)
    settings: dict = {}
    defaults["settings"] = settings
    try:
        # The very same dict object stored under "settings" is validated, so
        # any in-place changes made by the validator are visible in the form.
        # NOTE(review): presumably the validator fills in schema defaults — confirm.
        current_schema = SchemaManager.get_or_create()
        DefaultValidatingValidator(current_schema).validate(settings)
    except ValidationError as exc:
        self.message_user(request, exc.message, level=messages.ERROR)
    return defaults
# def get_changeform_initial_data(self, request: HttpRequest) -> dict[str, str]:
# initial_data = super().get_changeform_initial_data(request)
# initial_data["settings"] = {}
# try:
# schema = SchemaManager.get_or_create()
# DefaultValidatingValidator(schema).validate(initial_data["settings"])
# except ValidationError as e:
# self.message_user(request, e.message, level=messages.ERROR)
# return initial_data

def get_urls(self):
urls = super().get_urls()
Expand All @@ -49,11 +43,11 @@ def get_urls(self):
self.admin_site.admin_view(self.confirm_save),
name="confirm_save_config",
),
path(
"change-settings-schema/",
self.admin_site.admin_view(self.change_settings_schema),
name="change_settings_schema",
),
# path(
# "change-settings-schema/",
# self.admin_site.admin_view(self.change_settings_schema),
# name="change_settings_schema",
# ),
]
return custom_urls + urls

Expand Down Expand Up @@ -92,40 +86,40 @@ def confirm_save(self, request, object_id) -> HttpResponse: # pragma: no cover
},
)

@button(permission=is_root)
def change_settings_schema(
self, request: HttpRequest
) -> HttpResponse: # pragma: no cover
context = {
"opts": self.model._meta,
"site_header": site.site_header,
"title": "Change settings shema",
"trail_label": "Settings schema",
"has_view_permission": self.has_view_permission(request),
}
# @button(permission=is_root)
# def change_settings_schema(
# self, request: HttpRequest
# ) -> HttpResponse: # pragma: no cover
# context = {
# "opts": self.model._meta,
# "site_header": site.site_header,
# "title": "Change settings shema",
# "trail_label": "Settings schema",
# "has_view_permission": self.has_view_permission(request),
# }

if request.method == "POST":
form = EditSchemaForm(request.POST)
if form.is_valid():
try:
SchemaManager.save(form.cleaned_data["schema"])
except ValidationError as e:
self.message_user(request, e.message, level=messages.ERROR)
else:
self.message_user(request, "Schema has been updated.")
return redirect(reverse("admin:api_config_changelist"))
else:
try:
form = EditSchemaForm(initial={"schema": SchemaManager.get_or_create()})
except ValidationError as e:
self.message_user(request, e.message, level=messages.ERROR)
return redirect(reverse("admin:api_config_changelist"))
# if request.method == "POST":
# form = EditSchemaForm(request.POST)
# if form.is_valid():
# try:
# SchemaManager.save(form.cleaned_data["schema"])
# except ValidationError as e:
# self.message_user(request, e.message, level=messages.ERROR)
# else:
# self.message_user(request, "Schema has been updated.")
# return redirect(reverse("admin:api_config_changelist"))
# else:
# try:
# form = EditSchemaForm(initial={"schema": SchemaManager.get_or_create()})
# except ValidationError as e:
# self.message_user(request, e.message, level=messages.ERROR)
# return redirect(reverse("admin:api_config_changelist"))

return render(
request,
"admin/api/config/change_settings_schema.html",
{
"form": form,
**context,
},
)
# return render(
# request,
# "admin/api/config/change_settings_schema.html",
# {
# "form": form,
# **context,
# },
# )
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,11 @@
from adminfilters.filters import DjangoLookupFilter, NumberFilter
from adminfilters.mixin import AdminFiltersMixin

from hope_dedup_engine.apps.api.models import Duplicate
from hope_dedup_engine.apps.api.models import Finding


@register(Duplicate)
class DuplicateAdmin(AdminFiltersMixin, ModelAdmin):
@register(Finding)
class FindingAdmin(AdminFiltersMixin, ModelAdmin):
list_display = (
"id",
"deduplication_set",
Expand Down
19 changes: 14 additions & 5 deletions src/hope_dedup_engine/apps/api/deduplication/adapters.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from hope_dedup_engine.apps.api.deduplication.registry import DuplicateKeyPair
from hope_dedup_engine.apps.api.models import DeduplicationSet
from hope_dedup_engine.apps.faces.services import FacialDetector
from hope_dedup_engine.constants import is_facial_error


class DuplicateFaceFinder:
Expand Down Expand Up @@ -32,11 +33,19 @@ def run(
# options.apply_config_overrides(self.deduplication_set.config.settings)
# ignored key pairs are not handled correctly in DuplicationDetector
detector = FacialDetector(
tuple[str](filename_to_reference_pk.keys()), options=options
self.deduplication_set.pk,
tuple[str](filename_to_reference_pk.keys()),
options=options,
)
for first_filename, second_filename, distance in detector.find_duplicates(
tracker
# tracker
):
yield filename_to_reference_pk[first_filename], filename_to_reference_pk[
second_filename
], 1 - distance
yield (
filename_to_reference_pk[first_filename],
(
filename_to_reference_pk[second_filename]
if second_filename in filename_to_reference_pk
else second_filename
),
distance if is_facial_error(distance) else (1 - distance),
)
125 changes: 65 additions & 60 deletions src/hope_dedup_engine/apps/api/deduplication/process.py
Original file line number Diff line number Diff line change
@@ -1,66 +1,62 @@
from collections.abc import Callable
from functools import partial

from celery import shared_task

from hope_dedup_engine.apps.api.deduplication.registry import (
DuplicateFinder,
DuplicateKeyPair,
from hope_dedup_engine.apps.api.deduplication.registry import ( # DuplicateFinder,; DuplicateKeyPair,
get_finders,
)
from hope_dedup_engine.apps.api.models import DedupJob, DeduplicationSet, Duplicate
from hope_dedup_engine.apps.api.models import DedupJob, DeduplicationSet, Finding
from hope_dedup_engine.apps.api.utils.notification import send_notification
from hope_dedup_engine.apps.api.utils.progress import track_progress_multi


def _sort_keys(pair: DuplicateKeyPair) -> DuplicateKeyPair:
    """Return ``pair`` with its two keys in ascending order; the score is kept as is."""
    key_a, key_b, score = pair
    lower, upper = sorted((key_a, key_b))
    return lower, upper, score


def _save_duplicates(
    finder: DuplicateFinder,
    deduplication_set: DeduplicationSet,
    tracker: Callable[[int], None],
) -> None:
    """Run ``finder`` and add its weighted scores to the set's ``Duplicate`` rows.

    Every pair yielded by ``finder.run(tracker)`` is put in canonical
    (sorted) key order. A pair is skipped when it is listed in the
    deduplication set's ignored reference-pk pairs, or when the filenames of
    its two images are listed in the ignored filename pairs. For each
    remaining pair the matching ``Duplicate`` row is fetched or created and
    ``score * finder.weight`` is added to its score.

    Args:
        finder: Duplicate finder to execute; must expose ``run`` and ``weight``.
        deduplication_set: Set whose images and ignore lists are consulted and
            to which the resulting ``Duplicate`` rows belong.
        tracker: Progress callback passed through to ``finder.run``.
    """
    filename_by_reference_pk = dict(
        deduplication_set.image_set.values_list("reference_pk", "filename")
    )

    ignored_filename_pairs = frozenset(
        tuple(sorted(pair))
        for pair in deduplication_set.ignoredfilenamepair_set.values_list(
            "first", "second"
        )
    )
    # Candidate pairs are always compared in sorted order (see _sort_keys), so
    # the ignored reference-pk pairs must be normalised the same way; otherwise
    # an entry stored as (b, a) would never match and the pair would be saved.
    ignored_reference_pk_pairs = frozenset(
        tuple(sorted(pair))
        for pair in deduplication_set.ignoredreferencepkpair_set.values_list(
            "first", "second"
        )
    )

    for first, second, score in map(_sort_keys, finder.run(tracker)):
        filename_pair = tuple(
            sorted(
                (
                    filename_by_reference_pk[first],
                    filename_by_reference_pk[second],
                )
            )
        )
        if (
            (first, second) in ignored_reference_pk_pairs
            or filename_pair in ignored_filename_pairs
        ):
            continue

        duplicate, _ = Duplicate.objects.get_or_create(
            deduplication_set=deduplication_set,
            first_reference_pk=first,
            second_reference_pk=second,
        )
        # Scores from several finders accumulate on the same row.
        duplicate.score += score * finder.weight
        duplicate.save()
# def _sort_keys(pair: DuplicateKeyPair) -> DuplicateKeyPair:
# first, second, score = pair
# return *sorted((first, second)), score


# def _save_duplicates(
# finder: DuplicateFinder,
# deduplication_set: DeduplicationSet,
# tracker: Callable[[int], None],
# ) -> None:
# reference_pk_to_filename_mapping = dict(
# deduplication_set.image_set.values_list("reference_pk", "filename")
# )
# ignored_filename_pairs = frozenset(
# map(
# tuple,
# map(
# sorted,
# deduplication_set.ignoredfilenamepair_set.values_list(
# "first", "second"
# ),
# ),
# )
# )

# ignored_reference_pk_pairs = frozenset(
# deduplication_set.ignoredreferencepkpair_set.values_list("first", "second")
# )

# for first, second, score in map(_sort_keys, finder.run(tracker)):
# first_filename, second_filename = sorted(
# (
# reference_pk_to_filename_mapping[first],
# reference_pk_to_filename_mapping[second],
# )
# )
# ignored = (first, second) in ignored_reference_pk_pairs or (
# first_filename,
# second_filename,
# ) in ignored_filename_pairs
# if not ignored:
# duplicate, _ = Duplicate.objects.get_or_create(
# deduplication_set=deduplication_set,
# first_reference_pk=first,
# second_reference_pk=second,
# )
# duplicate.score += score * finder.weight
# duplicate.save()


HOUR = 60 * 60
Expand All @@ -82,19 +78,28 @@ def find_duplicates(dedup_job_id: int, version: int) -> None:
send_notification(deduplication_set.notification_url)

# clean results
Duplicate.objects.filter(deduplication_set=deduplication_set).delete()
Finding.objects.filter(deduplication_set=deduplication_set).delete()

weight_total = 0
for finder, tracker in zip(
# for finder, tracker in zip(
for finder, _ in zip(
get_finders(deduplication_set),
track_progress_multi(partial(update_job_progress, dedup_job)),
):
_save_duplicates(finder, deduplication_set, tracker)
# _save_duplicates(finder, deduplication_set, tracker)
weight_total += finder.weight

for duplicate in deduplication_set.duplicate_set.all():
duplicate.score /= weight_total
duplicate.save()
for finding in deduplication_set.finding_set.all():
finding.score /= weight_total
finding.save()

for finder, tracker in zip(
get_finders(deduplication_set),
track_progress_multi(partial(update_job_progress, dedup_job)),
):
for first, second, score in finder.run(tracker):
finding = (first, second, score * finder.weight)
deduplication_set.update_findings(finding)

deduplication_set.state = deduplication_set.State.CLEAN
deduplication_set.save()
Expand Down
Loading

0 comments on commit 27ab517

Please sign in to comment.