From 86b09b19874fb73cf40438a575cc26e14e7bedc4 Mon Sep 17 00:00:00 2001
From: Pascal Repond
Date: Tue, 12 Nov 2024 09:12:33 +0100
Subject: [PATCH] fix(documents): delete orphan harvested documents

* Closes #3776.

Co-Authored-by: Pascal Repond
---
 rero_ils/config.py                       |  8 ++++
 rero_ils/modules/documents/tasks.py      | 50 ++++++++++++++++++++++++
 tests/ui/documents/test_documents_api.py | 18 ++++++++-
 3 files changed, 75 insertions(+), 1 deletion(-)

diff --git a/rero_ils/config.py b/rero_ils/config.py
index d5ed19b187..1f3f146f75 100644
--- a/rero_ils/config.py
+++ b/rero_ils/config.py
@@ -518,6 +518,14 @@ def _(x):
         "kwargs": {"delete": True},
         "enabled": False,
     },
+    "delete-orphan-harvested": {
+        "task": "rero_ils.modules.documents.tasks.delete_orphan_harvested",
+        "schedule": crontab(
+            minute=0, hour=5, day_of_week=6
+        ),  # Every Saturday at 05:00 UTC (Celery crontab: 0 = Sunday ... 6 = Saturday; use 0 for Sunday)
+        "kwargs": {"delete": True},
+        "enabled": False,
+    },
     # "mef-harvester": {
     #     "task": "rero_ils.modules.apiharvester.tasks.harvest_records",
     #     "schedule": timedelta(minutes=60),
diff --git a/rero_ils/modules/documents/tasks.py b/rero_ils/modules/documents/tasks.py
index cb2f7d6da4..7a53ac26f5 100644
--- a/rero_ils/modules/documents/tasks.py
+++ b/rero_ils/modules/documents/tasks.py
@@ -36,6 +36,56 @@ def reindex_document(pid):
     Document.get_record_by_pid(pid).reindex()
 
 
+@shared_task(ignore_result=True)
+def delete_orphan_harvested(delete=False, verbose=False):
+    """Delete orphan harvested documents.
+
+    Orphans are the documents found in the search index with ``harvested``
+    set to true and without a ``holdings`` field.
+
+    :param delete: if True delete from DB and ES, if False no document is modified.
+    :param verbose: Verbose print.
+    :returns: count of deleted documents.
+    """
+    query = (
+        DocumentsSearch()
+        .filter("term", harvested=True)
+        .exclude("exists", field="holdings")
+    )
+    # collect all pids before the loop, which reindexes and deletes documents
+    pids = [hit.pid for hit in query.source("pid").scan()]
+    count = 0
+
+    if verbose:
+        click.secho(f"Orphan harvested documents count: {len(pids)}", fg="yellow")
+    for pid in pids:
+        if not (doc := Document.get_record_by_pid(pid)):
+            continue
+        if verbose:
+            click.secho(f"Deleting orphan harvested: {pid}", fg="yellow")
+        if not delete:
+            continue
+        # read the reasons once: the except branch must not call the record again
+        reasons = None
+        try:
+            reasons = doc.reasons_not_to_delete()
+            # only delete documents that have no links to me, only reason not to delete should be 'harvested'
+            if reasons == {"others": {"harvested": True}}:
+                # NOTE(review): presumably delete() refuses while 'harvested' is set - confirm
+                doc.pop("harvested")
+                doc.replace(doc, dbcommit=True, reindex=True)
+                doc.delete(dbcommit=True, delindex=True)
+                count += 1
+        except Exception as err:
+            msg = f"COULD NOT DELETE ORPHAN HARVESTED: {pid} {reasons} {err!r}"
+            if verbose:
+                click.secho(f"ERROR: {msg}", fg="red")
+            current_app.logger.warning(msg)
+
+    set_timestamp("delete_orphan_harvested", msg={"deleted": count})
+    return count
+
+
 @shared_task(ignore_result=True)
 def delete_drafts(days=1, delete=False, verbose=False):
     """Delete drafts.
diff --git a/tests/ui/documents/test_documents_api.py b/tests/ui/documents/test_documents_api.py
index a329337dbf..85608329bf 100644
--- a/tests/ui/documents/test_documents_api.py
+++ b/tests/ui/documents/test_documents_api.py
@@ -34,7 +34,7 @@
     document_id_fetcher,
 )
 from rero_ils.modules.documents.models import DocumentIdentifier
-from rero_ils.modules.documents.tasks import delete_drafts
+from rero_ils.modules.documents.tasks import delete_drafts, delete_orphan_harvested
 from rero_ils.modules.ebooks.tasks import create_records
 from rero_ils.modules.entities.models import EntityType
 from rero_ils.modules.entities.remote_entities.api import (
@@ -421,3 +421,19 @@ def test_document_delete_draft(app, document_chinese_data):
     doc["_draft"] = True
     doc.update(data=doc, dbcommit=True, reindex=True)
     assert delete_drafts(days=0, delete=True) == 1
+
+
+def test_document_delete_orphan_harvested(
+    app, document_data, holding_lib_sion_electronic
+):
+    """Test document delete orphan harvested.
+
+    Make sure that ebooks with electronic holdings (holding_lib_sion_electronic) are not deleted.
+    """
+    doc = Document.create(
+        data=document_data, delete_pid=True, dbcommit=True, reindex=True
+    )
+    assert delete_orphan_harvested(delete=True) == 0
+    doc["harvested"] = True
+    doc.update(data=doc, dbcommit=True, reindex=True)
+    assert delete_orphan_harvested(delete=True) == 1  # exactly one deletion once doc is flagged