Skip to content

Commit

Permalink
Merge pull request #79 from naved001/refactor/invoice_class_2
Browse files Browse the repository at this point in the history
Some updates to the pod class
  • Loading branch information
naved001 authored Oct 16, 2024
2 parents c9768a8 + 5419778 commit c86396e
Show file tree
Hide file tree
Showing 5 changed files with 175 additions and 182 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/unit-tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ jobs:
- name: Set up python
uses: actions/setup-python@v4
with:
python-version: '3.10'
python-version: '3.11'

- name: Install dependencies
run: |
Expand Down
73 changes: 54 additions & 19 deletions openshift_metrics/invoice.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from collections import namedtuple
from typing import List
from decimal import Decimal, ROUND_HALF_UP
import datetime

# GPU types
GPU_A100 = "NVIDIA-A100-40GB"
Expand All @@ -28,9 +29,11 @@

ServiceUnit = namedtuple("ServiceUnit", ["su_type", "su_count", "determinig_resource"])


@dataclass
class Pod:
"""Object that represents a pod"""

pod_name: str
namespace: str
start_time: int
Expand All @@ -43,20 +46,19 @@ class Pod:
node_hostname: str
node_model: str

@staticmethod
def get_service_unit(cpu_count, memory_count, gpu_count, gpu_type, gpu_resource) -> ServiceUnit:
def get_service_unit(self) -> ServiceUnit:
"""
Returns the type of service unit, the count, and the determining resource
"""
su_type = SU_UNKNOWN
su_count = 0

# pods that requested a specific GPU but weren't scheduled may report 0 GPU
if gpu_resource is not None and gpu_count == 0:
if self.gpu_resource is not None and self.gpu_request == 0:
return ServiceUnit(SU_UNKNOWN_GPU, 0, "GPU")

# pods in weird states
if cpu_count == 0 or memory_count == 0:
if self.cpu_request == 0 or self.memory_request == 0:
return ServiceUnit(SU_UNKNOWN, 0, "CPU")

known_gpu_su = {
Expand All @@ -82,18 +84,18 @@ def get_service_unit(cpu_count, memory_count, gpu_count, gpu_type, gpu_resource)
SU_UNKNOWN: {"gpu": -1, "cpu": 1, "ram": 1},
}

if gpu_resource is None and gpu_count == 0:
if self.gpu_resource is None and self.gpu_request == 0:
su_type = SU_CPU
elif gpu_type is not None and gpu_resource == WHOLE_GPU:
su_type = known_gpu_su.get(gpu_type, SU_UNKNOWN_GPU)
elif gpu_type == GPU_A100_SXM4: # for MIG GPU of type A100_SXM4
su_type = A100_SXM4_MIG.get(gpu_resource, SU_UNKNOWN_MIG_GPU)
elif self.gpu_type is not None and self.gpu_resource == WHOLE_GPU:
su_type = known_gpu_su.get(self.gpu_type, SU_UNKNOWN_GPU)
elif self.gpu_type == GPU_A100_SXM4: # for MIG GPU of type A100_SXM4
su_type = A100_SXM4_MIG.get(self.gpu_resource, SU_UNKNOWN_MIG_GPU)
else:
return ServiceUnit(SU_UNKNOWN_GPU, 0, "GPU")

cpu_multiplier = cpu_count / su_config[su_type]["cpu"]
gpu_multiplier = gpu_count / su_config[su_type]["gpu"]
memory_multiplier = memory_count / su_config[su_type]["ram"]
cpu_multiplier = self.cpu_request / su_config[su_type]["cpu"]
gpu_multiplier = self.gpu_request / su_config[su_type]["gpu"]
memory_multiplier = self.memory_request / su_config[su_type]["ram"]

su_count = max(cpu_multiplier, gpu_multiplier, memory_multiplier)

Expand All @@ -114,6 +116,45 @@ def get_runtime(self) -> Decimal:
"""Return runtime eligible for billing in hours"""
return Decimal(self.duration) / 3600

@property
def end_time(self) -> int:
return self.start_time + self.duration

def generate_pod_row(self):
"""
This returns a row to represent pod data.
It converts the epoch_time stamps to datetime timestamps so it's more readable.
Additionally, some metrics are rounded for readibility.
"""
su_type, su_count, determining_resource = self.get_service_unit()
start_time = datetime.datetime.fromtimestamp(
self.start_time, datetime.UTC
).strftime("%Y-%m-%dT%H:%M:%S")
end_time = datetime.datetime.fromtimestamp(
self.end_time, datetime.UTC
).strftime("%Y-%m-%dT%H:%M:%S")
memory_request = self.memory_request.quantize(
Decimal(".0001"), rounding=ROUND_HALF_UP
)
runtime = self.get_runtime().quantize(Decimal(".0001"), rounding=ROUND_HALF_UP)
return [
self.namespace,
start_time,
end_time,
runtime,
self.pod_name,
self.cpu_request,
self.gpu_request,
self.gpu_type,
self.gpu_resource,
self.node_hostname,
self.node_model,
memory_request,
determining_resource,
su_type,
su_count,
]


@dataclass()
class Rates:
Expand Down Expand Up @@ -150,13 +191,7 @@ class ProjectInvoce:

def add_pod(self, pod: Pod) -> None:
"""Aggregate a pods data"""
su_type, su_count, _ = Pod.get_service_unit(
cpu_count=pod.cpu_request,
memory_count=pod.memory_request,
gpu_count=pod.gpu_request,
gpu_type=pod.gpu_type,
gpu_resource=pod.gpu_resource,
)
su_type, su_count, _ = pod.get_service_unit()
duration_in_hours = pod.get_runtime()
self.su_hours[su_type] += su_count * duration_in_hours

Expand Down
22 changes: 11 additions & 11 deletions openshift_metrics/tests/test_metrics_processor.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from unittest import TestCase
from openshift_metrics import metrics_processor, utils
from openshift_metrics import metrics_processor, invoice


class TestMergeMetrics(TestCase):
Expand Down Expand Up @@ -490,43 +490,43 @@ def test_condense_metrics_with_changing_gpu(self):
"cpu": 1,
"mem": 4,
"gpu_request": 1,
"gpu_type": utils.GPU_V100,
"gpu_type": invoice.GPU_V100,
},
2700: {
"cpu": 1,
"mem": 4,
"gpu_request": 1,
"gpu_type": utils.GPU_V100,
"gpu_type": invoice.GPU_V100,
},
3600: { # type of GPU is changed
"cpu": 1,
"mem": 4,
"gpu_request": 1,
"gpu_type": utils.GPU_A100_SXM4,
"gpu_type": invoice.GPU_A100_SXM4,
},
4500: {
"cpu": 1,
"mem": 4,
"gpu_request": 1,
"gpu_type": utils.GPU_A100_SXM4,
"gpu_type": invoice.GPU_A100_SXM4,
},
5400: {
"cpu": 1,
"mem": 4,
"gpu_request": 1,
"gpu_type": utils.GPU_A100_SXM4,
"gpu_type": invoice.GPU_A100_SXM4,
},
6300: { # count of GPU is changed
"cpu": 1,
"mem": 4,
"gpu_request": 3,
"gpu_type": utils.GPU_A100_SXM4,
"gpu_type": invoice.GPU_A100_SXM4,
},
7200: {
"cpu": 1,
"mem": 4,
"gpu_request": 3,
"gpu_type": utils.GPU_A100_SXM4,
"gpu_type": invoice.GPU_A100_SXM4,
},
8100: { # no longer using GPUs
"cpu": 1,
Expand All @@ -546,21 +546,21 @@ def test_condense_metrics_with_changing_gpu(self):
"mem": 4,
"duration": 1800,
"gpu_request": 1,
"gpu_type": utils.GPU_V100,
"gpu_type": invoice.GPU_V100,
},
3600: {
"cpu": 1,
"mem": 4,
"duration": 2700,
"gpu_request": 1,
"gpu_type": utils.GPU_A100_SXM4,
"gpu_type": invoice.GPU_A100_SXM4,
},
6300: {
"cpu": 1,
"mem": 4,
"duration": 1800,
"gpu_request": 3,
"gpu_type": utils.GPU_A100_SXM4,
"gpu_type": invoice.GPU_A100_SXM4,
},
8100: {
"cpu": 1,
Expand Down
Loading

0 comments on commit c86396e

Please sign in to comment.