Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

HostMetrics: Hard auto-cleanup #14255

Merged
merged 4 commits into from
Aug 30, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 12 additions & 12 deletions awx/main/management/commands/cleanup_host_metrics.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,22 @@
from awx.main.models import HostMetric
from django.core.management.base import BaseCommand
from django.conf import settings
from awx.main.tasks.host_metrics import HostMetricTask


class Command(BaseCommand):
"""
Run soft-deleting of HostMetrics
This command provides cleanup task for HostMetric model.
There are two modes; within one run the hard cleanup is executed first, followed by the soft cleanup:
- soft cleanup
- - Perform soft-deletion of all host metrics last automated 12 months ago or before (CLEANUP_HOST_METRICS_SOFT_THRESHOLD).
This is the same as issuing a DELETE request to /api/v2/host_metrics/N/ for all host metrics that match the criteria.
- - updates columns deleted, deleted_counter and last_deleted
- hard cleanup
- - Permanently erase from the database all host metrics that were soft-deleted 36 months ago or before (CLEANUP_HOST_METRICS_HARD_THRESHOLD).
This operation only removes records that are already soft-deleted; it runs before the soft deletion of the same run.
"""

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What if you tried assigning this docstring to a help variable as in the Django docs?

https://docs.djangoproject.com/en/4.2/howto/custom-management-commands/

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@AlanCoding we discussed that previously. The help variable is here, below the docstring, and should be a one-liner.
It's not supposed to be a long description like the docstring (also according to the docs you posted ^).

help = 'Run soft-deleting of HostMetrics'

def add_arguments(self, parser):
parser.add_argument('--months-ago', type=int, dest='months-ago', action='store', help='Threshold in months for soft-deleting')
help = 'Run soft and hard-deletion of HostMetrics'

def handle(self, *args, **options):
months_ago = options.get('months-ago') or None

if not months_ago:
months_ago = getattr(settings, 'CLEANUP_HOST_METRICS_SOFT_THRESHOLD', 12)

HostMetric.cleanup_task(months_ago)
HostMetricTask().cleanup(soft_threshold=settings.CLEANUP_HOST_METRICS_SOFT_THRESHOLD, hard_threshold=settings.CLEANUP_HOST_METRICS_HARD_THRESHOLD)
18 changes: 0 additions & 18 deletions awx/main/models/inventory.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
import os.path
from urllib.parse import urljoin

import dateutil.relativedelta
import yaml

# Django
Expand Down Expand Up @@ -890,23 +889,6 @@ def soft_restore(self):
self.deleted = False
self.save(update_fields=['deleted'])

@classmethod
def cleanup_task(cls, months_ago):
try:
months_ago = int(months_ago)
if months_ago <= 0:
raise ValueError()

last_automation_before = now() - dateutil.relativedelta.relativedelta(months=months_ago)

logger.info(f'cleanup_host_metrics: soft-deleting records last automated before {last_automation_before}')
HostMetric.active_objects.filter(last_automation__lt=last_automation_before).update(
deleted=True, deleted_counter=models.F('deleted_counter') + 1, last_deleted=now()
)
settings.CLEANUP_HOST_METRICS_LAST_TS = now()
except (TypeError, ValueError):
logger.error(f"cleanup_host_metrics: months_ago({months_ago}) has to be a positive integer value")


class HostMetricSummaryMonthly(models.Model):
"""
Expand Down
10 changes: 10 additions & 0 deletions awx/main/tasks/helpers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
from django.utils.timezone import now
from rest_framework.fields import DateTimeField


def is_run_threshold_reached(setting, threshold_seconds):
    """
    Tell whether a periodic task is due to run again.

    :param setting: stored timestamp of the previous run; a falsy value means
        the task has not recorded a run yet
    :param threshold_seconds: minimal number of seconds between two runs
    :return: True when no previous run is recorded or when more than
        threshold_seconds have elapsed since it, False otherwise
    """
    # No recorded run at all -> always due (the parser is not even consulted).
    if not setting:
        return True

    # The stored value is converted by DRF's DateTimeField.
    previous_run = DateTimeField().to_internal_value(setting)
    if not previous_run:
        return True

    elapsed = now() - previous_run
    return elapsed.total_seconds() > threshold_seconds
75 changes: 66 additions & 9 deletions awx/main/tasks/host_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,33 +3,90 @@
import logging

from django.conf import settings
from django.db.models import Count
from django.db.models import Count, F
from django.db.models.functions import TruncMonth
from django.utils.timezone import now
from rest_framework.fields import DateTimeField
from awx.main.dispatch import get_task_queuename
from awx.main.dispatch.publish import task
from awx.main.models.inventory import HostMetric, HostMetricSummaryMonthly
from awx.main.tasks.helpers import is_run_threshold_reached
from awx.conf.license import get_license

logger = logging.getLogger('awx.main.tasks.host_metric_summary_monthly')
logger = logging.getLogger('awx.main.tasks.host_metrics')


@task(queue=get_task_queuename)
def cleanup_host_metrics():
    """
    Periodic task: run the HostMetric cleanup once the minimal interval has passed.

    The interval is read from CLEANUP_HOST_METRICS_INTERVAL (days, default 30)
    and compared against CLEANUP_HOST_METRICS_LAST_TS. When the cleanup is due,
    HostMetricTask.cleanup() is called with the soft and hard thresholds taken
    from settings (defaults 12 and 36).
    """
    last_run = getattr(settings, 'CLEANUP_HOST_METRICS_LAST_TS', None)
    # The interval setting is in days; the helper compares seconds.
    interval_seconds = getattr(settings, 'CLEANUP_HOST_METRICS_INTERVAL', 30) * 86400
    if not is_run_threshold_reached(last_run, interval_seconds):
        return

    logger.info(f"Executing cleanup_host_metrics, last ran at {getattr(settings, 'CLEANUP_HOST_METRICS_LAST_TS', '---')}")
    soft_months = getattr(settings, 'CLEANUP_HOST_METRICS_SOFT_THRESHOLD', 12)
    hard_months = getattr(settings, 'CLEANUP_HOST_METRICS_HARD_THRESHOLD', 36)
    HostMetricTask().cleanup(soft_threshold=soft_months, hard_threshold=hard_months)
    logger.info("Finished cleanup_host_metrics")


@task(queue=get_task_queuename)
def host_metric_summary_monthly():
    """
    Periodic task: run HostMetricSummaryMonthlyTask once the minimal interval has passed.

    The interval is read from HOST_METRIC_SUMMARY_TASK_INTERVAL (days, default 7,
    i.e. roughly weekly) and compared against HOST_METRIC_SUMMARY_TASK_LAST_TS.
    """
    last_run = getattr(settings, 'HOST_METRIC_SUMMARY_TASK_LAST_TS', None)
    # The interval setting is in days; the helper compares seconds.
    interval_seconds = getattr(settings, 'HOST_METRIC_SUMMARY_TASK_INTERVAL', 7) * 86400

    # Single due-check through the shared helper; the former private
    # _is_run_threshold_reached() duplicate is no longer consulted.
    if not is_run_threshold_reached(last_run, interval_seconds):
        return

    logger.info(f"Executing host_metric_summary_monthly, last ran at {getattr(settings, 'HOST_METRIC_SUMMARY_TASK_LAST_TS', '---')}")
    HostMetricSummaryMonthlyTask().execute()
    logger.info("Finished host_metric_summary_monthly")


def _is_run_threshold_reached(setting, threshold_seconds):
last_time = DateTimeField().to_internal_value(setting) if setting else DateTimeField().to_internal_value('1970-01-01')
class HostMetricTask:
    """
    Cleanup task for the HostMetric model.

    There are two modes:
    - soft cleanup (updates columns deleted, deleted_counter and last_deleted)
    - hard cleanup (deletes from the db)

    All thresholds are expressed in months.
    """

    def cleanup(self, soft_threshold=None, hard_threshold=None):
        """
        Main entrypoint, runs either soft cleanup, hard cleanup or both.

        The hard cleanup is executed first, then the soft cleanup. A threshold
        left as None skips the corresponding mode. The timestamp of the run is
        stored in settings.CLEANUP_HOST_METRICS_LAST_TS afterwards, regardless
        of which modes were executed.

        :param soft_threshold: (int) months since last automation, or None to skip
        :param hard_threshold: (int) months since soft deletion, or None to skip
        :raises ValueError, TypeError: when a threshold is not a valid number of months
        """
        if hard_threshold is not None:
            self.hard_cleanup(hard_threshold)
        if soft_threshold is not None:
            self.soft_cleanup(soft_threshold)

        settings.CLEANUP_HOST_METRICS_LAST_TS = now()

    @staticmethod
    def _to_months(threshold, name):
        """Convert threshold to a non-negative int; name is used in error messages."""
        try:
            months = int(threshold)
        except (ValueError, TypeError) as e:
            raise type(e)(f"{name} has to be convertible to number") from e

        # A negative value would move the cutoff into the future and therefore
        # match (and soft-delete or permanently delete) every candidate record.
        if months < 0:
            raise ValueError(f"{name} has to be a non-negative number of months")
        return months

    @staticmethod
    def soft_cleanup(threshold=None):
        """
        Mark active host metrics as deleted when their last automation is older
        than the threshold.

        :param threshold: (int) months; None falls back to
            settings.CLEANUP_HOST_METRICS_SOFT_THRESHOLD (default 12)
        :raises ValueError, TypeError: when threshold is not a valid number of months
        """
        if threshold is None:
            threshold = getattr(settings, 'CLEANUP_HOST_METRICS_SOFT_THRESHOLD', 12)
        months = HostMetricTask._to_months(threshold, "soft_threshold")

        # One timestamp serves both as the cutoff base and as last_deleted.
        current = now()
        last_automation_before = current - relativedelta(months=months)
        rows = HostMetric.active_objects.filter(last_automation__lt=last_automation_before).update(
            deleted=True, deleted_counter=F('deleted_counter') + 1, last_deleted=current
        )
        logger.info(f'cleanup_host_metrics: soft-deleted records last automated before {last_automation_before}, affected rows: {rows}')

    @staticmethod
    def hard_cleanup(threshold=None):
        """
        Permanently delete host metrics that were soft-deleted before the threshold.

        Only rows with deleted=True and last_deleted older than the cutoff are removed.

        :param threshold: (int) months; None falls back to
            settings.CLEANUP_HOST_METRICS_HARD_THRESHOLD (default 36)
        :raises ValueError, TypeError: when threshold is not a valid number of months
        """
        if threshold is None:
            threshold = getattr(settings, 'CLEANUP_HOST_METRICS_HARD_THRESHOLD', 36)
        months = HostMetricTask._to_months(threshold, "hard_threshold")

        last_deleted_before = now() - relativedelta(months=months)
        queryset = HostMetric.objects.filter(deleted=True, last_deleted__lt=last_deleted_before)
        # QuerySet.delete() returns (total number of deleted objects, per-model counts).
        deleted_total, _per_model = queryset.delete()
        logger.info(f'cleanup_host_metrics: hard-deleted records which were soft deleted before {last_deleted_before}, affected rows: {deleted_total}')


class HostMetricSummaryMonthlyTask:
Expand Down
29 changes: 2 additions & 27 deletions awx/main/tasks/system.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,6 @@
Inventory,
SmartInventoryMembership,
Job,
HostMetric,
convert_jsonfields,
)
from awx.main.constants import ACTIVE_STATES
Expand All @@ -64,6 +63,7 @@

from awx.main.utils.reload import stop_local_services
from awx.main.utils.pglock import advisory_lock
from awx.main.tasks.helpers import is_run_threshold_reached
from awx.main.tasks.receptor import get_receptor_ctl, worker_info, worker_cleanup, administrative_workunit_reaper, write_receptor_config
from awx.main.consumers import emit_channel_notification
from awx.main import analytics
Expand Down Expand Up @@ -368,9 +368,7 @@ def send_notifications(notification_list, job_id=None):

@task(queue=get_task_queuename)
def gather_analytics():
    """
    Periodic task: gather analytics once the gather interval has passed.

    AUTOMATION_ANALYTICS_LAST_GATHER is read through django settings and handed
    to the shared is_run_threshold_reached() helper as a raw value, together
    with AUTOMATION_ANALYTICS_GATHER_INTERVAL as the threshold in seconds.
    """
    # The helper takes the setting value itself, so no Setting row lookup
    # (and no function-local import of awx.conf.models) is needed.
    last_gather = getattr(settings, 'AUTOMATION_ANALYTICS_LAST_GATHER', None)
    if is_run_threshold_reached(last_gather, settings.AUTOMATION_ANALYTICS_GATHER_INTERVAL):
        analytics.gather()


Expand Down Expand Up @@ -427,29 +425,6 @@ def cleanup_images_and_files():
_cleanup_images_and_files()


@task(queue=get_task_queuename)
def cleanup_host_metrics():
"""Run cleanup host metrics ~each month"""
# TODO: move whole method to host_metrics in follow-up PR
from awx.conf.models import Setting

if is_run_threshold_reached(
Setting.objects.filter(key='CLEANUP_HOST_METRICS_LAST_TS').first(), getattr(settings, 'CLEANUP_HOST_METRICS_INTERVAL', 30) * 86400
):
months_ago = getattr(settings, 'CLEANUP_HOST_METRICS_SOFT_THRESHOLD', 12)
logger.info("Executing cleanup_host_metrics")
HostMetric.cleanup_task(months_ago)
slemrmartin marked this conversation as resolved.
Show resolved Hide resolved
logger.info("Finished cleanup_host_metrics")


def is_run_threshold_reached(setting, threshold_seconds):
from rest_framework.fields import DateTimeField

last_time = DateTimeField().to_internal_value(setting.value) if setting and setting.value else DateTimeField().to_internal_value('1970-01-01')

return (now() - last_time).total_seconds() > threshold_seconds


@task(queue=get_task_queuename)
def cluster_node_health_check(node):
"""
Expand Down
78 changes: 78 additions & 0 deletions awx/main/tests/functional/commands/test_cleanup_host_metrics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
import pytest

from awx.main.tasks.host_metrics import HostMetricTask
from awx.main.models.inventory import HostMetric
from awx.main.tests.factories.fixtures import mk_host_metric
from dateutil.relativedelta import relativedelta
from django.conf import settings
from django.utils import timezone


@pytest.mark.django_db
def test_no_host_metrics():
    """Cleanup on an empty table must not crash and must leave the table empty."""
    assert HostMetric.objects.count() == 0
    for soft_months, hard_months in ((0, 0), (24, 42)):
        HostMetricTask().cleanup(soft_threshold=soft_months, hard_threshold=hard_months)
    assert HostMetric.objects.count() == 0


@pytest.mark.django_db
def test_delete_exception():
    """Thresholds that cannot be converted to a number raise the conversion error type."""
    task = HostMetricTask()

    # An empty string fails int() with ValueError ...
    with pytest.raises(ValueError):
        task.soft_cleanup("")

    # ... while an unsupported type such as a set fails with TypeError.
    with pytest.raises(TypeError):
        task.hard_cleanup(set())


@pytest.mark.django_db
@pytest.mark.parametrize('threshold', [settings.CLEANUP_HOST_METRICS_SOFT_THRESHOLD, 20])
def test_soft_delete(threshold):
    """Active metrics whose last_automation is older than the threshold become deleted=True."""
    # (hostname, last_automation, deleted); hours=-1 is one hour newer than the
    # cutoff, hours=1 is one hour older than the cutoff.
    fixtures = [
        ('host_1', ago(months=1), False),
        ('host_2', ago(months=1), True),
        ('host_3', ago(months=threshold, hours=-1), False),
        ('host_4', ago(months=threshold, hours=-1), True),
        ('host_5', ago(months=threshold, hours=1), False),
        ('host_6', ago(months=threshold, hours=1), True),
        ('host_7', ago(months=42), False),
        ('host_8', ago(months=42), True),
    ]
    for hostname, last_automation, deleted in fixtures:
        mk_host_metric(hostname, first_automation=ago(months=1), last_automation=last_automation, deleted=deleted)

    assert HostMetric.objects.count() == 8
    assert HostMetric.active_objects.count() == 4

    # Running the cleanup a second time must not change the outcome.
    for _ in range(2):
        HostMetricTask().cleanup(soft_threshold=threshold)
        # Soft cleanup never removes rows.
        assert HostMetric.objects.count() == 8

        still_active = HostMetric.objects.filter(deleted=False).order_by('hostname').values_list('hostname', flat=True)
        assert set(still_active) == {'host_1', 'host_3'}


@pytest.mark.django_db
@pytest.mark.parametrize('threshold', [settings.CLEANUP_HOST_METRICS_HARD_THRESHOLD, 20])
def test_hard_delete(threshold):
    """Metrics with deleted=True and last_deleted older than the threshold are removed from the db."""
    # (hostname, last_deleted, deleted); hours=-1 is one hour newer than the
    # cutoff, hours=1 is one hour older than the cutoff.
    fixtures = [
        ('host_1', ago(months=1), False),
        ('host_2', ago(months=1), True),
        ('host_3', ago(months=threshold, hours=-1), False),
        ('host_4', ago(months=threshold, hours=-1), True),
        ('host_5', ago(months=threshold, hours=1), False),
        ('host_6', ago(months=threshold, hours=1), True),
        ('host_7', ago(months=42), False),
        ('host_8', ago(months=42), True),
    ]
    for hostname, last_deleted, deleted in fixtures:
        mk_host_metric(hostname, first_automation=ago(months=1), last_deleted=last_deleted, deleted=deleted)

    assert HostMetric.objects.count() == 8
    assert HostMetric.active_objects.count() == 4

    # Running the cleanup a second time must not change the outcome.
    for _ in range(2):
        HostMetricTask().cleanup(hard_threshold=threshold)
        # Only host_6 and host_8 are both soft-deleted and past the cutoff.
        assert HostMetric.objects.count() == 6

        remaining = HostMetric.objects.order_by('hostname').values_list('hostname', flat=True)
        assert set(remaining) == {'host_1', 'host_2', 'host_3', 'host_4', 'host_5', 'host_7'}


def ago(months=0, hours=0):
    """Return the current time shifted back by the given months and hours (negative values shift forward)."""
    offset = relativedelta(months=months, hours=hours)
    return timezone.now() - offset
4 changes: 2 additions & 2 deletions awx/settings/defaults.py
Original file line number Diff line number Diff line change
Expand Up @@ -470,7 +470,7 @@
'receptor_reaper': {'task': 'awx.main.tasks.system.awx_receptor_workunit_reaper', 'schedule': timedelta(seconds=60)},
'send_subsystem_metrics': {'task': 'awx.main.analytics.analytics_tasks.send_subsystem_metrics', 'schedule': timedelta(seconds=20)},
'cleanup_images': {'task': 'awx.main.tasks.system.cleanup_images_and_files', 'schedule': timedelta(hours=3)},
'cleanup_host_metrics': {'task': 'awx.main.tasks.system.cleanup_host_metrics', 'schedule': timedelta(hours=3, minutes=30)},
'cleanup_host_metrics': {'task': 'awx.main.tasks.host_metrics.cleanup_host_metrics', 'schedule': timedelta(hours=3, minutes=30)},
'host_metric_summary_monthly': {'task': 'awx.main.tasks.host_metrics.host_metric_summary_monthly', 'schedule': timedelta(hours=4)},
}

Expand Down Expand Up @@ -1049,7 +1049,7 @@
# - 'unique_managed_hosts': Compliant = automated - deleted hosts (using /api/v2/host_metrics/)
SUBSCRIPTION_USAGE_MODEL = ''

# Host metrics cleanup - last time of the cleanup run (soft-deleting records)
# Host metrics cleanup - last time of the task/command run
CLEANUP_HOST_METRICS_LAST_TS = None
# Host metrics cleanup - minimal interval between two cleanups in days
CLEANUP_HOST_METRICS_INTERVAL = 30 # days
Expand Down
Loading