From 744108dec788dbaac6170e539f44c4ece97dfeff Mon Sep 17 00:00:00 2001
From: JoongunPark <8554137+JoongunPark@users.noreply.github.com>
Date: Thu, 14 Nov 2024 01:25:34 -0500
Subject: [PATCH] Update trace_linker to use external_id for finding GPU op's parent CPU op

---
 src/trace_link/trace_linker.py | 34 ++++++++++++++++++----------------
 1 file changed, 18 insertions(+), 16 deletions(-)

diff --git a/src/trace_link/trace_linker.py b/src/trace_link/trace_linker.py
index 71696ca8..bf0f8ba5 100644
--- a/src/trace_link/trace_linker.py
+++ b/src/trace_link/trace_linker.py
@@ -123,7 +123,6 @@ def load_sync_dependencies(
         )
         if not success:
             logging.error("Failed to load Critical Path Graph")
-            return sync_dependencies
 
         raw_events = trace_analysis.t.get_raw_trace_for_one_rank(rank=rank)["traceEvents"]
         for edge in cp_graph.critical_path_edges_set:
@@ -541,7 +540,7 @@ def map_host_to_device_ops(
     ]:
         """Map Chakra host operators to corresponding device operators."""
         logging.debug("Mapping Charka host operators to corresponding device operators.")
-        cpu_ev_idx_to_gpu_ops_map = self.group_gpu_ops_by_cpu_launchers(
+        cpu_external_id_to_gpu_ops_map = self.group_gpu_ops_by_cpu_launchers(
            kineto_gpu_ops, kineto_correlation_cuda_runtime_map, sorted_kineto_cpu_ops, sorted_kineto_cpu_op_ts
         )
 
@@ -569,7 +568,7 @@
             ) = self.link_ops(
                 host_op,
                 kineto_op,
-                cpu_ev_idx_to_gpu_ops_map,
+                cpu_external_id_to_gpu_ops_map,
                 kineto_rf_id_to_device_op_map,
                 kineto_external_id_to_kineto_op_map,
             )
@@ -593,7 +592,7 @@ def group_gpu_ops_by_cpu_launchers(
         """
         Group GPU operators based on their corresponding CPU launchers.
 
-        This is determined by the 'ev_idx' which links GPU operators to their initiating CPU launcher events.
+        This is determined by the 'external_id' which links GPU operators to their initiating CPU launcher events.
 
         Args:
             kineto_gpu_ops (List[KinetoOperator]): List of Kineto GPU operators.
@@ -607,9 +606,9 @@
             Dict[int, List[KinetoOperator]]: Mapping from CPU launch event indices to GPU operators.
 
         Raises:
-            ValueError: If 'ev_idx' is missing for any GPU operator.
+            ValueError: If 'external_id' is missing for any GPU operator.
         """
-        cpu_ev_idx_to_gpu_ops_map = {}
+        cpu_external_id_to_gpu_ops_map = {}
         for gpu_op in kineto_gpu_ops:
             parent_cpu_op = self.find_parent_cpu_op(
                 gpu_op, kineto_correlation_cuda_runtime_map, sorted_kineto_cpu_ops, sorted_kineto_cpu_op_ts
@@ -619,9 +618,9 @@
                 logging.warning(warning_msg)
                 continue
 
-            if parent_cpu_op.ev_idx == "":
+            if parent_cpu_op.external_id == "":
                 error_msg = (
-                    f"Missing 'ev_idx' for CPU operator {parent_cpu_op.name}. "
+                    f"Missing 'external_id' for CPU operator {parent_cpu_op.name}. "
                     f"Cannot link GPU op {gpu_op.name} to {parent_cpu_op.name}."
                 )
                 logging.warning(error_msg)
@@ -629,9 +628,9 @@ def group_gpu_ops_by_cpu_launchers(
 
             logging.debug(f"group_gpu_ops_by_cpu_launchers '{parent_cpu_op.name}' -> '{gpu_op.name}'")
-            cpu_ev_idx_to_gpu_ops_map.setdefault(parent_cpu_op.ev_idx, []).append(gpu_op)
+            cpu_external_id_to_gpu_ops_map.setdefault(parent_cpu_op.external_id, []).append(gpu_op)
 
-        return cpu_ev_idx_to_gpu_ops_map
+        return cpu_external_id_to_gpu_ops_map
 
     def find_parent_cpu_op(
         self,
@@ -735,26 +734,29 @@ def find_closest_op(
 
         # After skipping 'nccl:coalesced', verify that the closest operation is on the same thread
         # as the GPU operation
-        if closest_op.tid == kineto_gpu_op.tid:
+        if closest_op.tid == kineto_gpu_op.tid and closest_op.external_id == kineto_gpu_op.external_id:
             return closest_op
 
         # If the tids do not match, search forward to find the closest matching tid
+        potential_op = None
         for i in range(index - 1, -1, -1):
             op = sorted_kineto_cpu_ops[i]
             if op.tid == kineto_gpu_op.tid:
                 if "nccl" in kineto_gpu_op.name.lower() and op.name == "nccl:coalesced":
                     continue  # Skip 'nccl:coalesced' if it's an NCCL-related GPU operation
-                if op.timestamp <= ts:
+                if op.external_id == kineto_gpu_op.external_id:
                     return op
+                if op.timestamp <= ts and potential_op is None:
+                    potential_op = op
 
         # If no matching tid is found going forward, return None
-        return None
+        return closest_op
 
     def link_ops(
         self,
         host_op: PyTorchOperator,
         kineto_op: KinetoOperator,
-        cpu_ev_idx_to_gpu_ops_map: Dict[int, List[KinetoOperator]],
+        cpu_external_id_to_gpu_ops_map: Dict[int, List[KinetoOperator]],
         kineto_rf_id_to_device_op_map: Dict[int, KinetoOperator],
         kineto_external_id_to_kineto_op_map: Dict[int, KinetoOperator],
     ) -> Tuple[List[KinetoOperator], int, int, int, Optional[int]]:
@@ -764,7 +766,7 @@ def link_ops(
         Args:
             host_op (PyTorchOperator): Chakra host operator to link.
             kineto_op (KinetoOperator): Corresponding Kineto operator.
-            cpu_ev_idx_to_gpu_ops_map (Dict[int, List[KinetoOperator]]): GPU ops mapping.
+            cpu_external_id_to_gpu_ops_map (Dict[int, List[KinetoOperator]]): GPU ops mapping.
             kineto_rf_id_to_device_op_map (Dict[int, KinetoOperator]): Kineto operator mapping.
             kineto_external_id_to_kineto_op_map (Dict[int, KinetoOperator]): Mapping from external id to
                 KinetoOperators.
@@ -779,7 +781,7 @@ def link_ops(
             - List[int]: List of synchronization dependency IDs.
         """
         kineto_op.host_op = host_op
-        linked_gpu_ops = cpu_ev_idx_to_gpu_ops_map.get(kineto_op.ev_idx, [])
+        linked_gpu_ops = cpu_external_id_to_gpu_ops_map.get(kineto_op.external_id, [])
         inclusive_dur = kineto_op.inclusive_dur
         exclusive_dur = kineto_op.exclusive_dur
         timestamp = kineto_op.timestamp
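
Illustration (not part of the patch): the core of this change is the matching rule in find_closest_op — a candidate CPU launcher must now share the GPU op's external_id rather than merely precede it in time. Below is a minimal, self-contained sketch of that rule, assuming a simplified stand-in Op type (the real KinetoOperator carries many more fields) and omitting the patch's 'nccl:coalesced' special case; find_closest_op_sketch is a hypothetical name, not the project's API.

    import bisect
    from dataclasses import dataclass
    from typing import List, Optional

    @dataclass
    class Op:  # hypothetical stand-in for KinetoOperator
        name: str
        tid: int
        external_id: int
        timestamp: int

    def find_closest_op_sketch(gpu_op: Op, sorted_cpu_ops: List[Op], ts: int) -> Optional[Op]:
        # Locate the CPU op immediately preceding the GPU op's timestamp.
        timestamps = [op.timestamp for op in sorted_cpu_ops]
        index = bisect.bisect_right(timestamps, ts)
        if index == 0:
            return None
        closest_op = sorted_cpu_ops[index - 1]

        # Fast path: the nearest op already matches both thread and external_id.
        if closest_op.tid == gpu_op.tid and closest_op.external_id == gpu_op.external_id:
            return closest_op

        # Otherwise scan backwards for a same-thread op with a matching external_id.
        # potential_op mirrors the patch's bookkeeping of the nearest same-thread
        # op at or before ts, although the patch's final fallback is closest_op.
        potential_op = None
        for i in range(index - 1, -1, -1):
            op = sorted_cpu_ops[i]
            if op.tid == gpu_op.tid:
                if op.external_id == gpu_op.external_id:
                    return op
                if op.timestamp <= ts and potential_op is None:
                    potential_op = op

        # No external_id match on the matching thread: fall back to the
        # nearest-by-timestamp op, as the patched code does.
        return closest_op

    # Usage: the runtime launcher shares external_id 77 with the GPU kernel,
    # so it is chosen even though another CPU op is also nearby.
    cpu_ops = [
        Op("aten::mm", tid=1, external_id=42, timestamp=100),
        Op("cudaLaunchKernel", tid=2, external_id=77, timestamp=110),
    ]
    gpu_kernel = Op("gemm_kernel", tid=2, external_id=77, timestamp=150)
    assert find_closest_op_sketch(gpu_kernel, cpu_ops, gpu_kernel.timestamp).external_id == 77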