Skip to content

Commit

Permalink
fix(documents): delete orphan harvested documents
Browse files Browse the repository at this point in the history
* Closes #3776.

Co-Authored-by: Pascal Repond <[email protected]>
  • Loading branch information
PascalRepond committed Nov 12, 2024
1 parent 1fdc0fb commit 6532ff5
Show file tree
Hide file tree
Showing 3 changed files with 65 additions and 1 deletion.
8 changes: 8 additions & 0 deletions rero_ils/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -518,6 +518,14 @@ def _(x):
"kwargs": {"delete": True},
"enabled": False,
},
"delete-orphan-harvested": {
"task": "rero_ils.modules.documents.tasks.delete_orphan_harvested",
"schedule": crontab(
minute=0, hour=5, day_of_week=6
), # Every Saturday at 05:00 UTC (Celery crontab: day_of_week 0 = Sunday, 6 = Saturday)
"kwargs": {"delete": True},
"enabled": False,
},
# "mef-harvester": {
# "task": "rero_ils.modules.apiharvester.tasks.harvest_records",
# "schedule": timedelta(minutes=60),
Expand Down
40 changes: 40 additions & 0 deletions rero_ils/modules/documents/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,46 @@ def reindex_document(pid):
Document.get_record_by_pid(pid).reindex()


@shared_task(ignore_result=True)
def delete_orphan_harvested(delete=False, verbose=False):
    """Delete orphan harvested documents.

    Orphan candidates are the indexed documents flagged ``harvested`` that
    have no ``holdings`` field. A candidate is only removed when the sole
    reason not to delete it is its ``harvested`` flag; documents with any
    other reason (e.g. links from other resources) are left untouched.

    :param delete: if True delete from DB and ES; if False nothing is
        deleted (dry run) and the returned count is 0.
    :param verbose: Verbose print.
    :returns: count of deleted documents.
    """
    # Value of `reasons_not_to_delete()` for a document that nothing else
    # references: the only blocker is the `harvested` flag itself.
    only_harvested = {"others": {"harvested": True}}

    query = (
        DocumentsSearch()
        .filter("term", harvested=True)
        .exclude("exists", field="holdings")
    )
    # Collect all candidate pids first so the candidate total can be
    # reported before any record is touched.
    pids = [hit.pid for hit in query.source("pid").scan()]

    if verbose:
        click.secho(f"Orphan harvested documents count: {len(pids)}", fg="yellow")

    # Count only what is really deleted, so the return value and the
    # timestamp message never over-report.
    count = 0
    for pid in pids:
        doc = Document.get_record_by_pid(pid)
        if not doc:
            # Indexed but no record could be loaded: nothing to delete.
            continue
        if verbose:
            click.secho(f"Deleting orphan harvested: {pid}", fg="yellow")
        if not delete:
            continue

        # Captured before the deletion attempt so the error handler can
        # report it without calling into the record again.
        reasons = None
        try:
            reasons = doc.reasons_not_to_delete()
            # only delete documents that have no links to me, only reason
            # not to delete should be 'harvested'
            if reasons != only_harvested:
                continue
            # The `harvested` flag itself blocks deletion: drop it and
            # persist the record before deleting.
            doc.pop("harvested")
            doc.replace(doc, dbcommit=True, reindex=True)
            doc.delete(dbcommit=True, delindex=True)
            count += 1
        except Exception as err:
            # Best effort: report the failure and go on with the next pid.
            msg = f"COULD NOT DELETE ORPHAN HARVESTED: {pid} {reasons}: {err}"
            if verbose:
                click.secho(f"ERROR: {msg}", fg="red")
            current_app.logger.warning(msg)

    set_timestamp("delete_orphan_harvested", msg={"deleted": count})
    return count


@shared_task(ignore_result=True)
def delete_drafts(days=1, delete=False, verbose=False):
"""Delete drafts.
Expand Down
18 changes: 17 additions & 1 deletion tests/ui/documents/test_documents_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
document_id_fetcher,
)
from rero_ils.modules.documents.models import DocumentIdentifier
from rero_ils.modules.documents.tasks import delete_drafts
from rero_ils.modules.documents.tasks import delete_drafts, delete_orphan_harvested
from rero_ils.modules.ebooks.tasks import create_records
from rero_ils.modules.entities.models import EntityType
from rero_ils.modules.entities.remote_entities.api import (
Expand Down Expand Up @@ -421,3 +421,19 @@ def test_document_delete_draft(app, document_chinese_data):
doc["_draft"] = True
doc.update(data=doc, dbcommit=True, reindex=True)
assert delete_drafts(days=0, delete=True) == 1


def test_document_delete_orphan_harvested(
    app, document_data, holding_lib_sion_electronic
):
    """Test the deletion of orphan harvested documents.

    Make sure that ebooks with electronic holdings
    (holding_lib_sion_electronic) are not deleted.
    """
    record = Document.create(
        data=document_data, delete_pid=True, dbcommit=True, reindex=True
    )
    # The new document is not flagged as harvested yet: nothing to delete.
    deleted = delete_orphan_harvested(delete=True)
    assert deleted == 0

    # Once flagged as harvested, exactly one document is deleted.
    record["harvested"] = True
    record.update(data=record, dbcommit=True, reindex=True)
    deleted = delete_orphan_harvested(delete=True)
    assert deleted == 1

0 comments on commit 6532ff5

Please sign in to comment.