Skip to content

Commit

Permalink
Add classes to parse ESMF profiling data.
Browse files Browse the repository at this point in the history
  • Loading branch information
micaeljtoliveira committed Apr 9, 2024
1 parent 7f6d84c commit 191c370
Show file tree
Hide file tree
Showing 6 changed files with 513 additions and 0 deletions.
42 changes: 42 additions & 0 deletions om3utils/esmf_profiling.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
from pathlib import Path

from om3utils.profiling import ProfilingParser
from om3utils.esmf_trace import ESMFTrace, MultiPETTimingNode


class ESMFProfilingParser(ProfilingParser):
def __init__(self, dirname):
# ESMF provides the following metrics:
super().__init__()
self._metrics = ["hits", "tmin", "tmax", "tavg", "ttot"]
self._dirname = dirname

@property
def metrics(self) -> list:
return self._metrics

def read(self, path: Path) -> dict:
profiling_dir = path / self._dirname
if not profiling_dir.is_dir():
raise FileNotFoundError(f"Directory not found: {profiling_dir.as_posix()}")

trace = ESMFTrace(profiling_dir)

stats = {"region": trace.regions}
stats.update({key: [0] * len(trace.regions) for key in self.metrics})

for name, child in trace.multiPETTree.children.items():
self._add_node_stats(child, name, stats)

return stats

def _add_node_stats(self, node: MultiPETTimingNode, name: str, stats: dict):
index = stats["region"].index(name)
stats["hits"][index] += node.count_each
stats["tmin"][index] += node.total_min_s
stats["tmax"][index] += node.total_max_s
stats["tavg"][index] += node.total_mean_s
stats["ttot"][index] += node.total_sum_s

for name, child in node.children.items():
self._add_node_stats(child, name, stats)
300 changes: 300 additions & 0 deletions om3utils/esmf_trace.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,300 @@
"""Classes to represent timing trees as produced by ESMF when running in profiling mode.
ESMF can output an execution trace using the Common Trace Format (CTF). Full CTF documentation can be found online at
https://diamon.org/ctf/, but in a nutshell, CFT is a very flexible binary trace format. A reference parser
implementation of CTF is provided by the Babeltrace 2 library (https://babeltrace.org/), which also includes Python 3
bindings.
Note that most of the contents of this file are based on the code from esmf-profiler (https://github.com/esmf-org/esmf-profiler).
"""

import sys
from pathlib import Path

import bt2

from om3utils.utils import nano_to_sec


class SinglePETTimingNode:
"""Class representing a single PET timing node in a tree of profiling events.
One node corresponds to one profiling region and one PET (Persistent Execution Thread) and it can have
child regions, which are store in a list. This allows to create a tree of regions.
Furthermore, the root node of a tree (id = 0) also stores a dictionary of all the regions that exist in that tree.
This allows to efficiently append a new node to the tree.
"""

def __init__(self, _id: int, pet: int, name: str):
"""
Args:
_id (int): Region ID.
pet (int): PET number.
name (str): Region name.
"""
self._id = _id
self._pet = pet
self._name = name
self._total = 0
self._min = sys.maxsize
self._max = 0
self._mean = 0
self._count = 0
self._children = [] # List of children that have this node as direct parent

# Cache containing all of this node's children, direct or indirect, that is, all the nodes that belong to the
# tree that has this node as root. Note that only the root node of the tree maintains a cache (self._id = 0)
self._child_cache = {} # id -> SinglePetTimingTreeNode
if self._id == 0:
self._child_cache[self._id] = self

@property
def name(self):
return self._name

@property
def pet(self):
return self._pet

@property
def total(self):
return self._total

@total.setter
def total(self, value):
self._total = value

@property
def count(self):
return self._count

@count.setter
def count(self, value):
self._count = value

@property
def min(self):
return self._min

@min.setter
def min(self, value):
self._min = value

@property
def max(self):
return self._max

@max.setter
def max(self, value):
self._max = value

@property
def mean(self):
return self._mean

@mean.setter
def mean(self, value):
self._mean = value

@property
def children(self):
return self._children

#
def add_child(self, parentid, child: "SinglePETTimingNode"):
"""Add child node to the node with given parentid.
Note that it must be called only from the root of the tree (self._id == 0).
Args:
parentid (int): Parent region ID.
child (SinglePETTimingNode): Child to add.
Returns:
bool: Execution status.
"""
self._child_cache[parentid]._children.append(child)
self._child_cache[child._id] = child
return True


class MultiPETTimingNode:
"""Class representing a multi-PET timing node in a tree of profiling events.
One node corresponds to one profiling region executed by one or more PETs.
"""

def __init__(self):
self._children: dict[str, MultiPETTimingNode] = (
{}
) # sub-regions in the timing tree { name -> MultiPETTimingNode }
self._pet_count = 0 # the number of PETs reporting timing information for this node
self._count_each = -1 # how many times each PET called into this region
self._counts_match = True # if counts_each is not the same for all reporting PETs, then this is False
self._total_sum = 0 # sum of all totals
self._total_min = sys.maxsize # min of all totals
self._total_min_pet = -1 # PET with min total
self._total_max = 0 # max of all totals
self._total_max_pet = -1 # PET with max total
self._contributing_nodes = {} # map of contributing SinglePETTimingNodes (key = PET)

@property
def pet_count(self):
"""int: Number of PETs reporting timing information for this node."""
return self._pet_count

@property
def count_each(self):
"""int: How many times each PET is called into this region.
Note that this value is only meaningful if self.counts_match is true.
"""
return self._count_each

@property
def counts_match(self):
"""bool: Is the value of counts_each the same for all reporting PETs?"""
return self._counts_match

@property
def total_sum(self):
"""float: Sum of all totals from all PETs, in nanoseconds."""
return self._total_sum

@property
def total_sum_s(self):
"""float: Sum of all totals from all PETs, in seconds."""
return nano_to_sec(self._total_sum)

@property
def total_mean(self):
"""float: The mean total time, averaged over all PETs, in nanoseconds."""
return self._total_sum / self._pet_count

@property
def total_mean_s(self):
"""float: The mean total time, averaged over all PETs, in seconds."""
return nano_to_sec(self.total_mean)

@property
def total_min(self):
"""float: Minimum total time spend in this region among all PETs, in nanoseconds."""
return self._total_min

@property
def total_min_s(self):
"""float: Minimum total time spend in this region among all PETs, in seconds."""
return nano_to_sec(self._total_min)

@property
def total_min_pet(self):
"""int: ID of PET who spent the minimum total time in this region."""
return self._total_min_pet

@property
def total_max(self):
"""float: Maximum total time spend in this region among all PETs, in nanoseconds."""
return self._total_max

@property
def total_max_s(self):
"""float: Maximum total time spend in this region among all PETs, in seconds."""
return nano_to_sec(self._total_max)

@property
def total_max_pet(self):
"""int: ID of PET who spent the maximum total time in this region."""
return self._total_max_pet

@property
def children(self):
"""dict[str, MultiPETTimingNode]: Direct children of this node."""
return self._children

def _merge_children(self, other: SinglePETTimingNode):
for c in other.children:
rs = self._children.setdefault(c.name, MultiPETTimingNode())
rs.merge(c)

def merge(self, other: SinglePETTimingNode):
"""Merge a single-PET tree into this multi-PET timing tree.
Update the statistics of this MultiPETTimingNode by including the information from a new SinglePETTiming node.
This function then proceeds to traverse the entire single-PET tree and merge all of its nodes. This is achieved
by calling the self._merge_children method.
Args:
other (SinglePETTimingNode): Single-PET tree to merge.
"""
self._pet_count += 1
if self._pet_count == 1:
self._count_each = other.count
elif self._count_each != other.count:
self._counts_match = False

self._total_sum += other.total
if self._total_min > other.min:
self._total_min = other.min
self._total_min_pet = other.pet
if self._total_max < other.max:
self._total_max = other.max
self._total_max_pet = other.pet

self._contributing_nodes[other.pet] = other

self._merge_children(other)


class ESMFTrace:
def __init__(self, path: Path):
self._region_id_to_name_map = {} # { pet -> { region_id -> region name } }
self._timing_trees = {} # { pet -> root of timing tree }
self._regions = {"TOP"} # Set containing all known regions. Using a set ensures region's names are unique.

# Iterate over the trace messages.
for msg in bt2.TraceCollectionMessageIterator(path.as_posix()):
if type(msg) is bt2._EventMessageConst:
self._handle_event(msg)

self.multiPETTree = MultiPETTimingNode()
for _, t in self._timing_trees.items():
self.multiPETTree.merge(t)

@property
def regions(self):
return list(self._regions)

def _handle_event(self, msg: bt2._EventMessageConst):
if msg.event.name == "define_region":
pet = int(msg.event.packet.context_field["pet"])
regionid = int(msg.event.payload_field["id"])
name = str(msg.event.payload_field["name"])
self._region_id_to_name_map.setdefault(pet, {})[regionid] = name
self._regions.add(name)

elif msg.event.name == "region_profile":
pet = int(msg.event.packet.context_field["pet"])
region_id = int(msg.event.payload_field["id"])
parent_id = int(msg.event.payload_field["parentid"])

if region_id == 1:
# special case for outermost timed region
name = "TOP"
else:
_map = self._region_id_to_name_map[pet]
name = _map[region_id] # should already be there

node = SinglePETTimingNode(region_id, pet, name)
node.total = int(msg.event.payload_field["total"])
node.count = int(msg.event.payload_field["count"])
node.min = int(msg.event.payload_field["min"])
node.max = int(msg.event.payload_field["max"])

# add child to the timing tree for the given pet
root = self._timing_trees.setdefault(pet, SinglePETTimingNode(0, pet, "TOP_LEVEL"))
if not root.add_child(parent_id, node):
raise RuntimeError(
f"{self.__class__.__name__} child not added to tree pet = {pet}, regionid = {region_id}, parentid = {parent_id}, name = {name}"
)
5 changes: 5 additions & 0 deletions om3utils/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,3 +36,8 @@ def convert_to_string(value) -> str:
return "{:e}".format(value).replace("e", "D")
else:
return str(value)


def nano_to_sec(nanos):
"""Convert nanoseconds to seconds."""
return nanos / (1000 * 1000 * 1000)
5 changes: 5 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,11 @@ dependencies = [
"xarray",
]

[external]
host-requires = [
"pkg:generic/libbabeltrace2",
]

[build-system]
requires = ["setuptools", "setuptools_scm[toml]"]
build-backend = "setuptools.build_meta"
Expand Down
10 changes: 10 additions & 0 deletions tests/test_esmf_profiling.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
import pytest
from pathlib import Path

from om3utils.esmf_profiling import ESMFProfilingParser


def test_read_missing_esmf_profiling_dir():
with pytest.raises(FileNotFoundError):
parser = ESMFProfilingParser("benchmark")
parser.read(Path("garbage"))
Loading

0 comments on commit 191c370

Please sign in to comment.