diff --git a/.github/workflows/unit-tests.yaml b/.github/workflows/unit-tests.yaml
index f2156c6..f76655e 100644
--- a/.github/workflows/unit-tests.yaml
+++ b/.github/workflows/unit-tests.yaml
@@ -14,7 +14,7 @@ jobs:
       - name: Set up python
         uses: actions/setup-python@v4
         with:
-          python-version: '3.10'
+          python-version: '3.11'
       - name: Install dependencies
         run: |
diff --git a/openshift_metrics/invoice.py b/openshift_metrics/invoice.py
index 52a2315..34de449 100644
--- a/openshift_metrics/invoice.py
+++ b/openshift_metrics/invoice.py
@@ -3,6 +3,7 @@
 from collections import namedtuple
 from typing import List
 from decimal import Decimal, ROUND_HALF_UP
+import datetime
 
 # GPU types
 GPU_A100 = "NVIDIA-A100-40GB"
@@ -28,9 +29,11 @@
 ServiceUnit = namedtuple("ServiceUnit", ["su_type", "su_count", "determinig_resource"])
 
+
 @dataclass
 class Pod:
     """Object that represents a pod"""
+
     pod_name: str
     namespace: str
     start_time: int
@@ -43,8 +46,7 @@ class Pod:
     node_hostname: str
     node_model: str
 
-    @staticmethod
-    def get_service_unit(cpu_count, memory_count, gpu_count, gpu_type, gpu_resource) -> ServiceUnit:
+    def get_service_unit(self) -> ServiceUnit:
         """
         Returns the type of service unit, the count, and the determining resource
         """
@@ -52,11 +54,11 @@ def get_service_unit(cpu_count, memory_count, gpu_count, gpu_type, gpu_resource)
         su_count = 0
 
         # pods that requested a specific GPU but weren't scheduled may report 0 GPU
-        if gpu_resource is not None and gpu_count == 0:
+        if self.gpu_resource is not None and self.gpu_request == 0:
             return ServiceUnit(SU_UNKNOWN_GPU, 0, "GPU")
 
         # pods in weird states
-        if cpu_count == 0 or memory_count == 0:
+        if self.cpu_request == 0 or self.memory_request == 0:
             return ServiceUnit(SU_UNKNOWN, 0, "CPU")
 
         known_gpu_su = {
@@ -82,18 +84,18 @@ def get_service_unit(cpu_count, memory_count, gpu_count, gpu_type, gpu_resource)
             SU_UNKNOWN: {"gpu": -1, "cpu": 1, "ram": 1},
         }
 
-        if gpu_resource is None and gpu_count == 0:
+        if self.gpu_resource is None and self.gpu_request == 0:
             su_type = SU_CPU
-        elif gpu_type is not None and gpu_resource == WHOLE_GPU:
-            su_type = known_gpu_su.get(gpu_type, SU_UNKNOWN_GPU)
-        elif gpu_type == GPU_A100_SXM4:  # for MIG GPU of type A100_SXM4
-            su_type = A100_SXM4_MIG.get(gpu_resource, SU_UNKNOWN_MIG_GPU)
+        elif self.gpu_type is not None and self.gpu_resource == WHOLE_GPU:
+            su_type = known_gpu_su.get(self.gpu_type, SU_UNKNOWN_GPU)
+        elif self.gpu_type == GPU_A100_SXM4:  # for MIG GPU of type A100_SXM4
+            su_type = A100_SXM4_MIG.get(self.gpu_resource, SU_UNKNOWN_MIG_GPU)
         else:
             return ServiceUnit(SU_UNKNOWN_GPU, 0, "GPU")
 
-        cpu_multiplier = cpu_count / su_config[su_type]["cpu"]
-        gpu_multiplier = gpu_count / su_config[su_type]["gpu"]
-        memory_multiplier = memory_count / su_config[su_type]["ram"]
+        cpu_multiplier = self.cpu_request / su_config[su_type]["cpu"]
+        gpu_multiplier = self.gpu_request / su_config[su_type]["gpu"]
+        memory_multiplier = self.memory_request / su_config[su_type]["ram"]
 
         su_count = max(cpu_multiplier, gpu_multiplier, memory_multiplier)
 
@@ -114,6 +116,45 @@ def get_runtime(self) -> Decimal:
         """Return runtime eligible for billing in hours"""
         return Decimal(self.duration) / 3600
 
+    @property
+    def end_time(self) -> int:
+        return self.start_time + self.duration
+
+    def generate_pod_row(self):
+        """
+        This returns a row to represent pod data.
+        It converts the epoch timestamps to datetime timestamps so they are more readable.
+        Additionally, some metrics are rounded for readability.
+        """
+        su_type, su_count, determining_resource = self.get_service_unit()
+        start_time = datetime.datetime.fromtimestamp(
+            self.start_time, datetime.UTC
+        ).strftime("%Y-%m-%dT%H:%M:%S")
+        end_time = datetime.datetime.fromtimestamp(
+            self.end_time, datetime.UTC
+        ).strftime("%Y-%m-%dT%H:%M:%S")
+        memory_request = self.memory_request.quantize(
+            Decimal(".0001"), rounding=ROUND_HALF_UP
+        )
+        runtime = self.get_runtime().quantize(Decimal(".0001"), rounding=ROUND_HALF_UP)
+        return [
+            self.namespace,
+            start_time,
+            end_time,
+            runtime,
+            self.pod_name,
+            self.cpu_request,
+            self.gpu_request,
+            self.gpu_type,
+            self.gpu_resource,
+            self.node_hostname,
+            self.node_model,
+            memory_request,
+            determining_resource,
+            su_type,
+            su_count,
+        ]
+
 
 @dataclass()
 class Rates:
@@ -150,13 +191,7 @@ class ProjectInvoce:
 
     def add_pod(self, pod: Pod) -> None:
         """Aggregate a pods data"""
-        su_type, su_count, _ = Pod.get_service_unit(
-            cpu_count=pod.cpu_request,
-            memory_count=pod.memory_request,
-            gpu_count=pod.gpu_request,
-            gpu_type=pod.gpu_type,
-            gpu_resource=pod.gpu_resource,
-        )
+        su_type, su_count, _ = pod.get_service_unit()
         duration_in_hours = pod.get_runtime()
         self.su_hours[su_type] += su_count * duration_in_hours
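`get_service_unit` now reads the request fields off the `Pod` instance instead of taking five positional arguments, and the new `end_time` property and `generate_pod_row` helper produce a CSV-ready row. Note that `datetime.UTC` is a Python 3.11 addition (an alias of `datetime.timezone.utc`), which is presumably what motivates the CI bump above. A minimal sketch of the new call pattern, with illustrative values:

```python
# Sketch only (not part of the patch): exercising the refactored Pod API.
# memory_request is expected in GiB as a Decimal, since generate_pod_row
# calls .quantize() on it directly.
from decimal import Decimal
from openshift_metrics import invoice

pod = invoice.Pod(
    pod_name="pod1",
    namespace="namespace1",
    start_time=1700000000,
    duration=3600,
    cpu_request=Decimal("24"),
    gpu_request=Decimal("1"),
    memory_request=Decimal("74"),
    gpu_type=invoice.GPU_A100,
    gpu_resource=invoice.WHOLE_GPU,
    node_hostname="wrk-1",
    node_model="Dell",
)

su_type, su_count, determining_resource = pod.get_service_unit()
row = pod.generate_pod_row()  # timestamps rendered in UTC, metrics rounded
```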
+ """ + su_type, su_count, determining_resource = self.get_service_unit() + start_time = datetime.datetime.fromtimestamp( + self.start_time, datetime.UTC + ).strftime("%Y-%m-%dT%H:%M:%S") + end_time = datetime.datetime.fromtimestamp( + self.end_time, datetime.UTC + ).strftime("%Y-%m-%dT%H:%M:%S") + memory_request = self.memory_request.quantize( + Decimal(".0001"), rounding=ROUND_HALF_UP + ) + runtime = self.get_runtime().quantize(Decimal(".0001"), rounding=ROUND_HALF_UP) + return [ + self.namespace, + start_time, + end_time, + runtime, + self.pod_name, + self.cpu_request, + self.gpu_request, + self.gpu_type, + self.gpu_resource, + self.node_hostname, + self.node_model, + memory_request, + determining_resource, + su_type, + su_count, + ] + @dataclass() class Rates: @@ -150,13 +191,7 @@ class ProjectInvoce: def add_pod(self, pod: Pod) -> None: """Aggregate a pods data""" - su_type, su_count, _ = Pod.get_service_unit( - cpu_count=pod.cpu_request, - memory_count=pod.memory_request, - gpu_count=pod.gpu_request, - gpu_type=pod.gpu_type, - gpu_resource=pod.gpu_resource, - ) + su_type, su_count, _ = pod.get_service_unit() duration_in_hours = pod.get_runtime() self.su_hours[su_type] += su_count * duration_in_hours diff --git a/openshift_metrics/tests/test_metrics_processor.py b/openshift_metrics/tests/test_metrics_processor.py index 72141af..1c9e614 100644 --- a/openshift_metrics/tests/test_metrics_processor.py +++ b/openshift_metrics/tests/test_metrics_processor.py @@ -1,5 +1,5 @@ from unittest import TestCase -from openshift_metrics import metrics_processor, utils +from openshift_metrics import metrics_processor, invoice class TestMergeMetrics(TestCase): @@ -490,43 +490,43 @@ def test_condense_metrics_with_changing_gpu(self): "cpu": 1, "mem": 4, "gpu_request": 1, - "gpu_type": utils.GPU_V100, + "gpu_type": invoice.GPU_V100, }, 2700: { "cpu": 1, "mem": 4, "gpu_request": 1, - "gpu_type": utils.GPU_V100, + "gpu_type": invoice.GPU_V100, }, 3600: { # type of GPU is changed "cpu": 1, "mem": 4, "gpu_request": 1, - "gpu_type": utils.GPU_A100_SXM4, + "gpu_type": invoice.GPU_A100_SXM4, }, 4500: { "cpu": 1, "mem": 4, "gpu_request": 1, - "gpu_type": utils.GPU_A100_SXM4, + "gpu_type": invoice.GPU_A100_SXM4, }, 5400: { "cpu": 1, "mem": 4, "gpu_request": 1, - "gpu_type": utils.GPU_A100_SXM4, + "gpu_type": invoice.GPU_A100_SXM4, }, 6300: { # count of GPU is changed "cpu": 1, "mem": 4, "gpu_request": 3, - "gpu_type": utils.GPU_A100_SXM4, + "gpu_type": invoice.GPU_A100_SXM4, }, 7200: { "cpu": 1, "mem": 4, "gpu_request": 3, - "gpu_type": utils.GPU_A100_SXM4, + "gpu_type": invoice.GPU_A100_SXM4, }, 8100: { # no longer using GPUs "cpu": 1, @@ -546,21 +546,21 @@ def test_condense_metrics_with_changing_gpu(self): "mem": 4, "duration": 1800, "gpu_request": 1, - "gpu_type": utils.GPU_V100, + "gpu_type": invoice.GPU_V100, }, 3600: { "cpu": 1, "mem": 4, "duration": 2700, "gpu_request": 1, - "gpu_type": utils.GPU_A100_SXM4, + "gpu_type": invoice.GPU_A100_SXM4, }, 6300: { "cpu": 1, "mem": 4, "duration": 1800, "gpu_request": 3, - "gpu_type": utils.GPU_A100_SXM4, + "gpu_type": invoice.GPU_A100_SXM4, }, 8100: { "cpu": 1, diff --git a/openshift_metrics/tests/test_utils.py b/openshift_metrics/tests/test_utils.py index a98977e..3379c8a 100644 --- a/openshift_metrics/tests/test_utils.py +++ b/openshift_metrics/tests/test_utils.py @@ -73,16 +73,6 @@ class TestWriteMetricsByPod(TestCase): @mock.patch('openshift_metrics.utils.get_namespace_attributes') def test_write_metrics_log(self, mock_gna): - mock_gna.return_value = { - 
-            'namespace1': {
-                'cf_pi': 'PI1',
-                'cf_project_id': '123',
-            },
-            'namespace2': {
-                'cf_pi': 'PI2',
-                'cf_project_id': '456',
-            }
-        }
         test_metrics_dict = {
             "namespace1": {
                 "pod1": {
@@ -145,14 +135,14 @@ def test_write_metrics_log(self, mock_gna):
             }
         }
 
-        expected_output = ("Namespace,Coldfront_PI Name,Coldfront Project ID ,Pod Start Time,Pod End Time,Duration (Hours),Pod Name,CPU Request,GPU Request,GPU Type,GPU Resource,Node,Node Model,Memory Request (GiB),Determining Resource,SU Type,SU Count\n"
-                           "namespace1,PI1,123,1970-01-01T00:00:00,1970-01-01T00:02:00,0.0333,pod1,10,0,,,wrk-1,Dell,0.0010,CPU,OpenShift CPU,10\n"
-                           "namespace1,PI1,123,1970-01-01T00:02:00,1970-01-01T00:03:00,0.0167,pod1,20,0,,,wrk-2,Lenovo,0.0010,CPU,OpenShift CPU,20\n"
-                           "namespace1,PI1,123,1970-01-01T00:00:00,1970-01-01T00:01:00,0.0167,pod2,20,0,,,Unknown Node,Unknown Model,0.0098,CPU,OpenShift CPU,20\n"
-                           "namespace1,PI1,123,1970-01-01T00:01:00,1970-01-01T00:02:00,0.0167,pod2,25,0,,,Unknown Node,Unknown Model,0.0098,CPU,OpenShift CPU,25\n"
-                           "namespace1,PI1,123,1970-01-01T00:02:00,1970-01-01T00:03:00,0.0167,pod2,20,0,,,Unknown Node,Unknown Model,0.0098,CPU,OpenShift CPU,20\n"
-                           "namespace2,PI2,456,1970-01-01T00:00:00,1970-01-01T00:03:00,0.0500,pod3,45,0,,,Unknown Node,Unknown Model,0.0977,CPU,OpenShift CPU,45\n"
-                           "namespace2,PI2,456,1970-01-01T00:00:00,1970-01-01T01:00:00,1.0000,pod4,0.5,0,,,Unknown Node,Unknown Model,2.0000,CPU,OpenShift CPU,0.5\n")
+        expected_output = ("Namespace,Pod Start Time,Pod End Time,Duration (Hours),Pod Name,CPU Request,GPU Request,GPU Type,GPU Resource,Node,Node Model,Memory Request (GiB),Determining Resource,SU Type,SU Count\n"
+                           "namespace1,1970-01-01T00:00:00,1970-01-01T00:02:00,0.0333,pod1,10,0,,,wrk-1,Dell,0.0010,CPU,OpenShift CPU,10\n"
+                           "namespace1,1970-01-01T00:02:00,1970-01-01T00:03:00,0.0167,pod1,20,0,,,wrk-2,Lenovo,0.0010,CPU,OpenShift CPU,20\n"
+                           "namespace1,1970-01-01T00:00:00,1970-01-01T00:01:00,0.0167,pod2,20,0,,,Unknown Node,Unknown Model,0.0098,CPU,OpenShift CPU,20\n"
+                           "namespace1,1970-01-01T00:01:00,1970-01-01T00:02:00,0.0167,pod2,25,0,,,Unknown Node,Unknown Model,0.0098,CPU,OpenShift CPU,25\n"
+                           "namespace1,1970-01-01T00:02:00,1970-01-01T00:03:00,0.0167,pod2,20,0,,,Unknown Node,Unknown Model,0.0098,CPU,OpenShift CPU,20\n"
+                           "namespace2,1970-01-01T00:00:00,1970-01-01T00:03:00,0.0500,pod3,45,0,,,Unknown Node,Unknown Model,0.0977,CPU,OpenShift CPU,45\n"
+                           "namespace2,1970-01-01T00:00:00,1970-01-01T01:00:00,1.0000,pod4,0.5,0,,,Unknown Node,Unknown Model,2.0000,CPU,OpenShift CPU,0.5\n")
 
         with tempfile.NamedTemporaryFile(mode="w+") as tmp:
             utils.write_metrics_by_pod(test_metrics_dict, tmp.name)
@@ -221,21 +211,21 @@ def test_write_metrics_log(self, mock_gna):
                         "cpu_request": 1,
                         "memory_request": 8 * 2**30,
                         "gpu_request": 1,
-                        "gpu_type": utils.GPU_A100,
-                        "gpu_resource": utils.WHOLE_GPU,
+                        "gpu_type": invoice.GPU_A100,
+                        "gpu_resource": invoice.WHOLE_GPU,
                         "duration": 172700  # little under 48 hours, expect to be rounded up in the output
                     },
                 }
             },
             "pod5": {
-                "gpu_type": utils.GPU_A100_SXM4,
+                "gpu_type": invoice.GPU_A100_SXM4,
                 "metrics": {
                     0: {
                         "cpu_request": 24,
                         "memory_request": 8 * 2**30,
                         "gpu_request": 1,
-                        "gpu_type": utils.GPU_A100_SXM4,
-                        "gpu_resource": utils.WHOLE_GPU,
+                        "gpu_type": invoice.GPU_A100_SXM4,
+                        "gpu_resource": invoice.WHOLE_GPU,
                         "duration": 172800
                     },
                 }
@@ -301,118 +291,160 @@ def test_write_metrics_by_namespace_decimal(self, mock_gna):
 
 
 class TestGetServiceUnit(TestCase):
+    def make_pod(
+        self,
+        cpu_request,
+        memory_request,
+        gpu_request,
+        gpu_type,
+        gpu_resource
+    ):
+
+        return invoice.Pod(
+            pod_name="pod1",
+            namespace="namespace1",
+            start_time=600,
+            duration=3600,
+            cpu_request=cpu_request,
+            gpu_request=gpu_request,
+            memory_request=memory_request,
+            gpu_type=gpu_type,
+            gpu_resource=gpu_resource,
+            node_hostname="node-1",
+            node_model="model-1"
+        )
+
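An aside on the arithmetic the tests below pin down: `su_count` is the max of the per-resource multipliers, rounded up to a whole number for GPU SUs. A rough sketch, not part of the patch; the 24-CPU / 74-GiB-per-A100-SU geometry is inferred from the assertions, since the `su_config` table itself is outside the hunks shown:

```python
# Worked example matching test_known_gpu_high_cpu: 50 CPUs, 96 GiB, 1 A100.
import math
from decimal import Decimal

su_geometry = {"gpu": 1, "cpu": 24, "ram": 74}  # assumed SU_A100_GPU shape

multipliers = {
    "CPU": Decimal("50") / su_geometry["cpu"],  # ~2.08
    "RAM": Decimal("96") / su_geometry["ram"],  # ~1.30
    "GPU": Decimal("1") / su_geometry["gpu"],   # 1
}

determining_resource = max(multipliers, key=multipliers.get)  # "CPU"
su_count = math.ceil(max(multipliers.values()))               # 3
```

The CPU multiplier dominates, so the determining resource is CPU and three A100 SUs are billed, which is exactly what `test_known_gpu_high_cpu` asserts.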
     def test_cpu_only(self):
-        su_type, su_count, determining_resource = invoice.Pod.get_service_unit(4, 16, 0, None, None)
-        self.assertEqual(su_type, utils.SU_CPU)
+        pod = self.make_pod(4, 16, 0, None, None)
+        su_type, su_count, determining_resource = pod.get_service_unit()
+        self.assertEqual(su_type, invoice.SU_CPU)
         self.assertEqual(su_count, 4)
         self.assertEqual(determining_resource, "CPU")
 
     def test_known_gpu(self):
-        su_type, su_count, determining_resource = invoice.Pod.get_service_unit(24, 74, 1, utils.GPU_A100, utils.WHOLE_GPU)
-        self.assertEqual(su_type, utils.SU_A100_GPU)
+        pod = self.make_pod(24, 74, 1, invoice.GPU_A100, invoice.WHOLE_GPU)
+        su_type, su_count, determining_resource = pod.get_service_unit()
+        self.assertEqual(su_type, invoice.SU_A100_GPU)
         self.assertEqual(su_count, 1)
         self.assertEqual(determining_resource, "GPU")
 
     def test_known_gpu_A100_SXM4(self):
-        su_type, su_count, determining_resource = invoice.Pod.get_service_unit(32, 245, 1, utils.GPU_A100_SXM4, utils.WHOLE_GPU)
-        self.assertEqual(su_type, utils.SU_A100_SXM4_GPU)
+        pod = self.make_pod(32, 245, 1, invoice.GPU_A100_SXM4, invoice.WHOLE_GPU)
+        su_type, su_count, determining_resource = pod.get_service_unit()
+        self.assertEqual(su_type, invoice.SU_A100_SXM4_GPU)
         self.assertEqual(su_count, 1)
         self.assertEqual(determining_resource, "GPU")
 
     def test_known_gpu_high_cpu(self):
-        su_type, su_count, determining_resource = invoice.Pod.get_service_unit(50, 96, 1, utils.GPU_A100, utils.WHOLE_GPU)
-        self.assertEqual(su_type, utils.SU_A100_GPU)
+        pod = self.make_pod(50, 96, 1, invoice.GPU_A100, invoice.WHOLE_GPU)
+        su_type, su_count, determining_resource = pod.get_service_unit()
+        self.assertEqual(su_type, invoice.SU_A100_GPU)
         self.assertEqual(su_count, 3)
         self.assertEqual(determining_resource, "CPU")
 
     def test_known_gpu_high_memory(self):
-        su_type, su_count, determining_resource = invoice.Pod.get_service_unit(24, 100, 1, utils.GPU_A100, utils.WHOLE_GPU)
-        self.assertEqual(su_type, utils.SU_A100_GPU)
+        pod = self.make_pod(24, 100, 1, invoice.GPU_A100, invoice.WHOLE_GPU)
+        su_type, su_count, determining_resource = pod.get_service_unit()
+        self.assertEqual(su_type, invoice.SU_A100_GPU)
         self.assertEqual(su_count, 2)
         self.assertEqual(determining_resource, "RAM")
 
     def test_known_gpu_low_cpu_memory(self):
-        su_type, su_count, determining_resource = invoice.Pod.get_service_unit(2, 4, 1, utils.GPU_A100, utils.WHOLE_GPU)
-        self.assertEqual(su_type, utils.SU_A100_GPU)
+        pod = self.make_pod(2, 4, 1, invoice.GPU_A100, invoice.WHOLE_GPU)
+        su_type, su_count, determining_resource = pod.get_service_unit()
+        self.assertEqual(su_type, invoice.SU_A100_GPU)
         self.assertEqual(su_count, 1)
         self.assertEqual(determining_resource, "GPU")
 
     def test_unknown_gpu(self):
-        su_type, su_count, determining_resource = invoice.Pod.get_service_unit(8, 64, 1, "Unknown_GPU_Type", utils.WHOLE_GPU)
-        self.assertEqual(su_type, utils.SU_UNKNOWN_GPU)
+        pod = self.make_pod(8, 64, 1, "Unknown_GPU_Type", invoice.WHOLE_GPU)
+        su_type, su_count, determining_resource = pod.get_service_unit()
+        self.assertEqual(su_type, invoice.SU_UNKNOWN_GPU)
         self.assertEqual(su_count, 1)
         self.assertEqual(determining_resource, "GPU")
 
     def test_known_gpu_zero_count(self):
-        su_type, su_count, determining_resource = invoice.Pod.get_service_unit(8, 64, 0, utils.GPU_A100, utils.WHOLE_GPU)
-        self.assertEqual(su_type, utils.SU_UNKNOWN_GPU)
+        pod = self.make_pod(8, 64, 0, invoice.GPU_A100, invoice.WHOLE_GPU)
+        su_type, su_count, determining_resource = pod.get_service_unit()
+        self.assertEqual(su_type, invoice.SU_UNKNOWN_GPU)
         self.assertEqual(su_count, 0)
         self.assertEqual(determining_resource, "GPU")
 
     def test_known_mig_gpu(self):
-        su_type, su_count, determining_resource = invoice.Pod.get_service_unit(1, 4, 1, utils.GPU_A100_SXM4, utils.MIG_1G_5GB)
-        self.assertEqual(su_type, utils.SU_UNKNOWN_MIG_GPU)
+        pod = self.make_pod(1, 4, 1, invoice.GPU_A100_SXM4, invoice.MIG_1G_5GB)
+        su_type, su_count, determining_resource = pod.get_service_unit()
+        self.assertEqual(su_type, invoice.SU_UNKNOWN_MIG_GPU)
         self.assertEqual(su_count, 1)
         self.assertEqual(determining_resource, "GPU")
 
     def test_known_gpu_unknown_resource(self):
-        su_type, su_count, determining_resource = invoice.Pod.get_service_unit(1, 4, 1, utils.GPU_A100, "nvidia.com/mig_20G_500GB")
-        self.assertEqual(su_type, utils.SU_UNKNOWN_GPU)
+        pod = self.make_pod(1, 4, 1, invoice.GPU_A100, "nvidia.com/mig_20G_500GB")
+        su_type, su_count, determining_resource = pod.get_service_unit()
+        self.assertEqual(su_type, invoice.SU_UNKNOWN_GPU)
         self.assertEqual(su_count, 0)
         self.assertEqual(determining_resource, "GPU")
 
     def test_unknown_gpu_known_resource(self):
-        su_type, su_count, determining_resource = invoice.Pod.get_service_unit(1, 4, 1, "Unknown GPU", utils.MIG_2G_10GB)
-        self.assertEqual(su_type, utils.SU_UNKNOWN_GPU)
+        pod = self.make_pod(1, 4, 1, "Unknown GPU", invoice.MIG_2G_10GB)
+        su_type, su_count, determining_resource = pod.get_service_unit()
+        self.assertEqual(su_type, invoice.SU_UNKNOWN_GPU)
         self.assertEqual(su_count, 0)
         self.assertEqual(determining_resource, "GPU")
 
     def test_zero_memory(self):
-        su_type, su_count, determining_resource = invoice.Pod.get_service_unit(1, 0, 0, None, None)
-        self.assertEqual(su_type, utils.SU_UNKNOWN)
+        pod = self.make_pod(1, 0, 0, None, None)
+        su_type, su_count, determining_resource = pod.get_service_unit()
+        self.assertEqual(su_type, invoice.SU_UNKNOWN)
         self.assertEqual(su_count, 0)
         self.assertEqual(determining_resource, "CPU")
 
     def test_zero_cpu(self):
-        su_type, su_count, determining_resource = invoice.Pod.get_service_unit(0, 1, 0, None, None)
-        self.assertEqual(su_type, utils.SU_UNKNOWN)
+        pod = self.make_pod(0, 1, 0, None, None)
+        su_type, su_count, determining_resource = pod.get_service_unit()
+        self.assertEqual(su_type, invoice.SU_UNKNOWN)
         self.assertEqual(su_count, 0)
         self.assertEqual(determining_resource, "CPU")
 
     def test_memory_dominant(self):
-        su_type, su_count, determining_resource = invoice.Pod.get_service_unit(8, 64, 0, None, None)
-        self.assertEqual(su_type, utils.SU_CPU)
+        pod = self.make_pod(8, 64, 0, None, None)
+        su_type, su_count, determining_resource = pod.get_service_unit()
+        self.assertEqual(su_type, invoice.SU_CPU)
         self.assertEqual(su_count, 16)
         self.assertEqual(determining_resource, "RAM")
 
     def test_fractional_su_cpu_dominant(self):
-        su_type, su_count, determining_resource = invoice.Pod.get_service_unit(0.5, 0.5, 0, None, None)
-        self.assertEqual(su_type, utils.SU_CPU)
+        pod = self.make_pod(0.5, 0.5, 0, None, None)
+        su_type, su_count, determining_resource = pod.get_service_unit()
+        self.assertEqual(su_type, invoice.SU_CPU)
         self.assertEqual(su_count, 0.5)
         self.assertEqual(determining_resource, "CPU")
 
     def test_fractional_su_memory_dominant(self):
-        su_type, su_count, determining_resource = invoice.Pod.get_service_unit(0.1, 1, 0, None, None)
-        self.assertEqual(su_type, utils.SU_CPU)
+        pod = self.make_pod(0.1, 1, 0, None, None)
+        su_type, su_count, determining_resource = pod.get_service_unit()
+        self.assertEqual(su_type, invoice.SU_CPU)
         self.assertEqual(su_count, 0.25)
         self.assertEqual(determining_resource, "RAM")
 
     def test_known_gpu_fractional_cpu_memory(self):
-        su_type, su_count, determining_resource = invoice.Pod.get_service_unit(0.8, 0.8, 1, utils.GPU_A100, utils.WHOLE_GPU)
-        self.assertEqual(su_type, utils.SU_A100_GPU)
+        pod = self.make_pod(0.8, 0.8, 1, invoice.GPU_A100, invoice.WHOLE_GPU)
+        su_type, su_count, determining_resource = pod.get_service_unit()
+        self.assertEqual(su_type, invoice.SU_A100_GPU)
         self.assertEqual(su_count, 1)
         self.assertEqual(determining_resource, "GPU")
 
     def test_decimal_return_type(self):
         from decimal import Decimal
-        _, su_count, _ = invoice.Pod.get_service_unit(Decimal("1"), Decimal("8.1"), Decimal("0"), None, None)
+        pod = self.make_pod(Decimal("1"), Decimal("8.1"), Decimal("0"), None, None)
+        _, su_count, _ = pod.get_service_unit()
         self.assertIsInstance(su_count, Decimal)
         self.assertEqual(su_count, Decimal('2.025'))
 
     def test_not_decimal_return_type_when_gpu_su_type(self):
         from decimal import Decimal
-        su_type, su_count, _ = invoice.Pod.get_service_unit(Decimal("1"), Decimal("76"), Decimal("1"), utils.GPU_A100, utils.WHOLE_GPU)
+        pod = self.make_pod(Decimal("1"), Decimal("76"), Decimal("1"), invoice.GPU_A100, invoice.WHOLE_GPU)
         # for GPU SUs, we always round up to the nearest integer
+        su_type, su_count, _ = pod.get_service_unit()
         self.assertIsInstance(su_count, int)
         self.assertEqual(su_count, 2)
-        self.assertEqual(su_type, utils.SU_A100_GPU)
+        self.assertEqual(su_type, invoice.SU_A100_GPU)
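The 0.0333/0.0167-hour durations and 0.0010-GiB memory figures in the expected CSV above fall out of the `quantize` calls in `generate_pod_row`; a minimal sketch of that rounding:

```python
# Seconds -> hours, then ROUND_HALF_UP to four decimal places, as in
# generate_pod_row. Values mirror the expected_output rows above.
from decimal import Decimal, ROUND_HALF_UP

two_minutes = (Decimal(120) / 3600).quantize(Decimal(".0001"), rounding=ROUND_HALF_UP)
one_minute = (Decimal(60) / 3600).quantize(Decimal(".0001"), rounding=ROUND_HALF_UP)
one_mib_in_gib = (Decimal(2**20) / 2**30).quantize(Decimal(".0001"), rounding=ROUND_HALF_UP)

assert str(two_minutes) == "0.0333"
assert str(one_minute) == "0.0167"
assert str(one_mib_in_gib) == "0.0010"
```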
diff --git a/openshift_metrics/utils.py b/openshift_metrics/utils.py
index 02689a4..1ea9012 100755
--- a/openshift_metrics/utils.py
+++ b/openshift_metrics/utils.py
@@ -14,51 +14,12 @@
 """Holds bunch of utility functions"""
 
 import os
-import datetime
-import time
-import math
 import csv
 import requests
 import boto3
 from openshift_metrics import invoice
 from decimal import Decimal
-import decimal
-from urllib3.util.retry import Retry
-from requests.adapters import HTTPAdapter
-
-# GPU types
-GPU_A100 = "NVIDIA-A100-40GB"
-GPU_A100_SXM4 = "NVIDIA-A100-SXM4-40GB"
-GPU_V100 = "Tesla-V100-PCIE-32GB"
-GPU_UNKNOWN_TYPE = "GPU_UNKNOWN_TYPE"
-
-# GPU Resource - MIG Geometries
-# A100 Strategies
-MIG_1G_5GB = "nvidia.com/mig-1g.5gb"
-MIG_2G_10GB = "nvidia.com/mig-2g.10gb"
-MIG_3G_20GB = "nvidia.com/mig-3g.20gb"
-WHOLE_GPU = "nvidia.com/gpu"
-
-
-# SU Types
-SU_CPU = "OpenShift CPU"
-SU_A100_GPU = "OpenShift GPUA100"
-SU_A100_SXM4_GPU = "OpenShift GPUA100SXM4"
-SU_V100_GPU = "OpenShift GPUV100"
-SU_UNKNOWN_GPU = "OpenShift Unknown GPU"
-SU_UNKNOWN_MIG_GPU = "OpenShift Unknown MIG GPU"
-SU_UNKNOWN = "Openshift Unknown"
-
-RATE = {
-    SU_CPU: Decimal("0.013"),
-    SU_A100_GPU: Decimal("1.803"),
-    SU_A100_SXM4_GPU: Decimal("2.078"),
-    SU_V100_GPU: Decimal("1.214"),
-    SU_UNKNOWN_GPU: Decimal("0"),
-}
-
-STEP_MIN = 15
 
 
 class EmptyResultError(Exception):
@@ -225,19 +186,13 @@ def write_metrics_by_namespace(condensed_metrics_dict, file_name, report_month):
     csv_writer(rows, file_name)
 
 
-def write_metrics_by_pod(metrics_dict, file_name):
+def write_metrics_by_pod(condensed_metrics_dict, file_name):
     """
-    Generates metrics report by pod
-
-    It currently includes service units for each pod, but that doesn't make sense
-    as we are calculating the CPU/Memory service units at the project level
+    Generates metrics report by pod.
     """
     rows = []
-    namespace_annotations = get_namespace_attributes()
     headers = [
         "Namespace",
-        "Coldfront_PI Name",
-        "Coldfront Project ID ",
         "Pod Start Time",
         "Pod End Time",
         "Duration (Hours)",
@@ -255,52 +210,23 @@ def write_metrics_by_pod(metrics_dict, file_name):
     ]
     rows.append(headers)
 
-    for namespace, pods in metrics_dict.items():
-        for pod, pod_dict in pods.items():
+    for namespace, pods in condensed_metrics_dict.items():
+        for pod_name, pod_dict in pods.items():
             pod_metrics_dict = pod_dict["metrics"]
-            namespace_annotation_dict = namespace_annotations.get(namespace, {})
-            cf_pi = namespace_annotation_dict.get("cf_pi")
-            cf_project_id = namespace_annotation_dict.get("cf_project_id")
-
             for epoch_time, pod_metric_dict in pod_metrics_dict.items():
-                start_time = datetime.datetime.utcfromtimestamp(float(epoch_time)).strftime(
-                    "%Y-%m-%dT%H:%M:%S"
-                )
-                end_time = datetime.datetime.utcfromtimestamp(
-                    float(epoch_time + pod_metric_dict["duration"])
-                ).strftime("%Y-%m-%dT%H:%M:%S")
-                duration = (Decimal(pod_metric_dict["duration"]) / 3600).quantize(Decimal(".0001"), rounding=decimal.ROUND_HALF_UP)
-                cpu_request = Decimal(pod_metric_dict.get("cpu_request", 0))
-                gpu_request = Decimal(pod_metric_dict.get("gpu_request", 0))
-                gpu_type = pod_metric_dict.get("gpu_type")
-                gpu_resource = pod_metric_dict.get("gpu_resource")
-                node = pod_metric_dict.get("node", "Unknown Node")
-                node_model = pod_metric_dict.get("node_model", "Unknown Model")
-                memory_request = (Decimal(pod_metric_dict.get("memory_request", 0)) / 2**30).quantize(Decimal(".0001"), rounding=decimal.ROUND_HALF_UP)
-                su_type, su_count, determining_resource = invoice.Pod.get_service_unit(
-                    cpu_request, memory_request, gpu_request, gpu_type, gpu_resource
+                pod_obj = invoice.Pod(
+                    pod_name=pod_name,
+                    namespace=namespace,
+                    start_time=epoch_time,
+                    duration=pod_metric_dict["duration"],
+                    cpu_request=Decimal(pod_metric_dict.get("cpu_request", 0)),
+                    gpu_request=Decimal(pod_metric_dict.get("gpu_request", 0)),
+                    memory_request=Decimal(pod_metric_dict.get("memory_request", 0)) / 2**30,
+                    gpu_type=pod_metric_dict.get("gpu_type"),
+                    gpu_resource=pod_metric_dict.get("gpu_resource"),
+                    node_hostname=pod_metric_dict.get("node", "Unknown Node"),
+                    node_model=pod_metric_dict.get("node_model", "Unknown Model"),
                 )
-
-                info_list = [
-                    namespace,
-                    cf_pi,
-                    cf_project_id,
-                    start_time,
-                    end_time,
-                    duration,
-                    pod,
-                    cpu_request,
-                    gpu_request,
-                    gpu_type,
-                    gpu_resource,
-                    node,
-                    node_model,
-                    memory_request,
-                    determining_resource,
-                    su_type,
-                    su_count,
-                ]
-
-                rows.append(info_list)
+                rows.append(pod_obj.generate_pod_row())
 
     csv_writer(rows, file_name)
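With the Coldfront lookup gone, `write_metrics_by_pod` no longer needs `get_namespace_attributes` mocked and simply builds one `invoice.Pod` per metrics interval. A hypothetical end-to-end call; the dict shape mirrors the tests, while the file name and values are illustrative:

```python
# Sketch only: a one-pod condensed metrics dict fed to the rewritten writer.
from openshift_metrics import utils

condensed_metrics_dict = {
    "namespace1": {
        "pod1": {
            "metrics": {
                0: {                          # interval start (epoch seconds)
                    "cpu_request": 10,
                    "memory_request": 2**20,  # bytes; converted to GiB inside
                    "duration": 120,          # seconds
                },
            },
        },
    },
}

# Produces a CSV with the slimmed-down header (no Coldfront columns).
utils.write_metrics_by_pod(condensed_metrics_dict, "pod_report.csv")
```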