From e85dd8c082244a331ec228e39149276c8f0c2742 Mon Sep 17 00:00:00 2001 From: Jonathan Lifflander Date: Thu, 5 Dec 2024 16:22:23 -0800 Subject: [PATCH 01/10] #2375: scripts: add lb_iterations to JSON data file schema validator --- scripts/LBDatafile_schema.py | 60 +++++++++++++++++++++++++++++++++++- 1 file changed, 59 insertions(+), 1 deletion(-) diff --git a/scripts/LBDatafile_schema.py b/scripts/LBDatafile_schema.py index a25f6d6e7e..1096750040 100644 --- a/scripts/LBDatafile_schema.py +++ b/scripts/LBDatafile_schema.py @@ -93,7 +93,65 @@ def validate_ids(field): 'bytes': float } ], - Optional('user_defined'): dict + Optional('user_defined'): dict, + Optional('lb_iterations'): [ + { + 'id': int, + 'tasks': [ + { + 'entity': And({ + Optional('collection_id'): int, + 'home': int, + Optional('id'): int, + Optional('seq_id'): int, + Optional('index'): [int], + 'type': str, + 'migratable': bool, + Optional('objgroup_id'): int + }, validate_ids), + 'node': int, + 'resource': str, + Optional('subphases'): [ + { + 'id': int, + 'time': float, + } + ], + 'time': float, + Optional('user_defined'): dict, + Optional('attributes'): dict + }, + ], + Optional('communications'): [ + { + 'type': str, + 'to': And({ + 'type': str, + Optional('id'): int, + Optional('seq_id'): int, + Optional('home'): int, + Optional('collection_id'): int, + Optional('migratable'): bool, + Optional('index'): [int], + Optional('objgroup_id'): int, + }, validate_ids), + 'messages': int, + 'from': And({ + 'type': str, + Optional('id'): int, + Optional('seq_id'): int, + Optional('home'): int, + Optional('collection_id'): int, + Optional('migratable'): bool, + Optional('index'): [int], + Optional('objgroup_id'): int, + }, validate_ids), + 'bytes': float + } + ], + Optional('user_defined'): dict + } + ] }, ] } From 70aa8b003c7fe663d9315a722af1d40cd2f6bada Mon Sep 17 00:00:00 2001 From: Jonathan Lifflander Date: Wed, 18 Dec 2024 15:33:22 -0800 Subject: [PATCH 02/10] #2382: ccm-lb: add backoff to avoid performance issues w/locking --- .../balance/temperedlb/temperedlb.cc | 21 ++++++++++++------- .../balance/temperedlb/temperedlb.h | 6 ++++-- 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/src/vt/vrt/collection/balance/temperedlb/temperedlb.cc b/src/vt/vrt/collection/balance/temperedlb/temperedlb.cc index 37ff65b224..de666e0114 100644 --- a/src/vt/vrt/collection/balance/temperedlb/temperedlb.cc +++ b/src/vt/vrt/collection/balance/temperedlb/temperedlb.cc @@ -1195,7 +1195,7 @@ void TemperedLB::doLBStages(LoadType start_imb) { double const memory_usage = computeMemoryUsage(); vt_debug_print( - terse, temperedlb, + normal, temperedlb, "Current memory info: total memory usage={}, shared blocks here={}, " "memory_threshold={}\n", memory_usage, getSharedBlocksHere().size(), mem_thresh_ @@ -1472,7 +1472,7 @@ void TemperedLB::informAsync() { if (is_overloaded_) { vt_debug_print( - terse, temperedlb, + normal, temperedlb, "TemperedLB::informAsync: trial={}, iter={}, known underloaded={}\n", trial_, iter_, underloaded_.size() ); @@ -2510,8 +2510,8 @@ void TemperedLB::lockObtained(LockedInfoMsg* in_msg) { vt_debug_print( normal, temperedlb, - "lockObtained: is_locked_={}, is_swapping_={}\n", - is_locked_, is_swapping_ + "lockObtained: is_locked_={}, is_swapping_={}, locking_rank_={}, msg->locked_node={}, is_swapping={}\n", + is_locked_, is_swapping_, locking_rank_, msg->locked_node, is_swapping_ ); auto cur_epoch = theMsg()->getEpoch(); @@ -2527,7 +2527,7 @@ void TemperedLB::lockObtained(LockedInfoMsg* in_msg) { if (is_locked_ && locking_rank_ <= msg->locked_node) { proxy_[msg->locked_node].template send<&TemperedLB::releaseLock>(); theTerm()->consume(cur_epoch); - try_locks_.emplace(msg->locked_node, msg->locked_c_try); + try_locks_.emplace(msg->locked_node, msg->locked_c_try, 1); //pending_actions_.push_back(action); } else if (is_locked_) { pending_actions_.push_back(action); @@ -2539,7 +2539,6 @@ void TemperedLB::lockObtained(LockedInfoMsg* in_msg) { "lockObtained: running action immediately\n" ); - action(); } } @@ -2551,7 +2550,8 @@ void TemperedLB::satisfyLockRequest() { for (auto&& tl : try_locks_) { vt_debug_print( verbose, temperedlb, - "satisfyLockRequest: node={}, c_try={}\n", tl.requesting_node, tl.c_try + "satisfyLockRequest: node={}, c_try={}, forced_release={}\n", + tl.requesting_node, tl.c_try, tl.forced_release ); } @@ -2559,6 +2559,13 @@ void TemperedLB::satisfyLockRequest() { auto lock = *iter; try_locks_.erase(iter); + if (lock.forced_release) { + std::this_thread::sleep_for(std::chrono::milliseconds(5)); + lock.forced_release = false; + try_locks_.insert(lock); + return; + } + auto const this_node = theContext()->getNode(); vt_debug_print( diff --git a/src/vt/vrt/collection/balance/temperedlb/temperedlb.h b/src/vt/vrt/collection/balance/temperedlb/temperedlb.h index 92c89f36a7..22a7eab804 100644 --- a/src/vt/vrt/collection/balance/temperedlb/temperedlb.h +++ b/src/vt/vrt/collection/balance/temperedlb/temperedlb.h @@ -435,13 +435,15 @@ struct TemperedLB : BaseLB { ////////////////////////////////////////////////////////////////////////////// struct TryLock { - TryLock(NodeType in_requesting, double in_c_try) + TryLock(NodeType in_requesting, double in_c_try, int in_forced_release = 0) : requesting_node(in_requesting), - c_try(in_c_try) + c_try(in_c_try), + forced_release(in_forced_release) { } NodeType requesting_node = uninitialized_destination; double c_try = 0; + int forced_release = 0; double operator<(TryLock const& other) const { // sort in reverse order so the best is first! From 71f0ea53180834cb003e9f99b4d17b060fceca87 Mon Sep 17 00:00:00 2001 From: Jonathan Lifflander Date: Thu, 19 Dec 2024 14:12:57 -0800 Subject: [PATCH 03/10] #2382: ccm-lb: add max iter time and cycle lock count --- .../balance/temperedlb/temperedlb.cc | 25 +++++++++++++++---- .../balance/temperedlb/temperedlb.h | 3 +++ 2 files changed, 23 insertions(+), 5 deletions(-) diff --git a/src/vt/vrt/collection/balance/temperedlb/temperedlb.cc b/src/vt/vrt/collection/balance/temperedlb/temperedlb.cc index de666e0114..9a4b2e8eab 100644 --- a/src/vt/vrt/collection/balance/temperedlb/temperedlb.cc +++ b/src/vt/vrt/collection/balance/temperedlb/temperedlb.cc @@ -1066,6 +1066,7 @@ void TemperedLB::doLBStages(LoadType start_imb) { for (iter_ = 0; iter_ < num_iters_; iter_++) { bool first_iter = iter_ == 0; + iter_time_ = MPI_Wtime(); if (first_iter) { // Copy this node's object assignments to a local, mutable copy @@ -1178,6 +1179,7 @@ void TemperedLB::doLBStages(LoadType start_imb) { is_overloaded_ = is_underloaded_ = false; ready_to_satisfy_locks_ = false; other_rank_clusters_.clear(); + cycle_locks_ = 0; // Not clearing shared_block_size_ because this never changes and // the knowledge might be useful @@ -1408,7 +1410,7 @@ void TemperedLB::loadStatsHandler(std::vector const& vec) { } void TemperedLB::rejectionStatsHandler( - int n_rejected, int n_transfers, int n_unhomed_blocks + int n_rejected, int n_transfers, int n_unhomed_blocks, int cycle_locks ) { double rej = static_cast(n_rejected) / static_cast(n_rejected + n_transfers) * 100.0; @@ -1419,8 +1421,18 @@ void TemperedLB::rejectionStatsHandler( terse, temperedlb, "TemperedLB::rejectionStatsHandler: n_transfers={} n_unhomed_blocks={}" " n_rejected={} " - "rejection_rate={:0.1f}%\n", - n_transfers, n_unhomed_blocks, n_rejected, rej + "rejection_rate={:0.1f}%, total_cycle_locks={}\n", + n_transfers, n_unhomed_blocks, n_rejected, rej, cycle_locks + ); + } +} + +void TemperedLB::maxIterTime(double max_iter_time) { + auto this_node = theContext()->getNode(); + if (this_node == 0) { + vt_debug_print( + terse, temperedlb, + "TemperedLB::maxIterTime: {}\n", max_iter_time ); } } @@ -2136,8 +2148,9 @@ void TemperedLB::originalTransfer() { // compute rejection rate because it will be printed runInEpochCollective("TemperedLB::originalTransfer -> compute rejection", [=] { proxy_.allreduce<&TemperedLB::rejectionStatsHandler, collective::PlusOp>( - n_rejected, n_transfers, 0 + n_rejected, n_transfers, 0, 0 ); + proxy_.allreduce<&TemperedLB::maxIterTime, collective::MaxOp>(iter_time_); }); } } @@ -2525,6 +2538,7 @@ void TemperedLB::lockObtained(LockedInfoMsg* in_msg) { }; if (is_locked_ && locking_rank_ <= msg->locked_node) { + cycle_locks_++; proxy_[msg->locked_node].template send<&TemperedLB::releaseLock>(); theTerm()->consume(cur_epoch); try_locks_.emplace(msg->locked_node, msg->locked_c_try, 1); @@ -2708,8 +2722,9 @@ void TemperedLB::swapClusters() { auto remote_block_count = getRemoteBlockCountHere(); runInEpochCollective("TemperedLB::swapClusters -> compute rejection", [=] { proxy_.allreduce<&TemperedLB::rejectionStatsHandler, collective::PlusOp>( - n_rejected, n_transfers_swap_, remote_block_count + n_rejected, n_transfers_swap_, remote_block_count, cycle_locks_, iter_time_ ); + proxy_.allreduce<&TemperedLB::maxIterTime, collective::MaxOp>(iter_time_); }); } } diff --git a/src/vt/vrt/collection/balance/temperedlb/temperedlb.h b/src/vt/vrt/collection/balance/temperedlb/temperedlb.h index 22a7eab804..55bdf6ba84 100644 --- a/src/vt/vrt/collection/balance/temperedlb/temperedlb.h +++ b/src/vt/vrt/collection/balance/temperedlb/temperedlb.h @@ -129,6 +129,7 @@ struct TemperedLB : BaseLB { void rejectionStatsHandler( int n_rejected, int n_transfers, int n_unhomed_blocks ); + void maxIterTime(double max_iter_time); void remoteBlockCountHandler(int n_unhomed_blocks); void thunkMigrations(); @@ -424,6 +425,8 @@ struct TemperedLB : BaseLB { StatisticMapType stats; LoadType this_load = 0.0f; LoadType this_work = 0.0f; + int cycle_locks_ = 0; + double iter_time_ = 0.0f; /// Whether any node has communication data bool has_comm_any_ = false; From 6cc5fd924fe2d02dcde40e6b2f142d1e1742695d Mon Sep 17 00:00:00 2001 From: Jonathan Lifflander Date: Thu, 19 Dec 2024 14:15:25 -0800 Subject: [PATCH 04/10] #2382: ccm-lb: fix bug --- src/vt/vrt/collection/balance/temperedlb/temperedlb.cc | 2 +- src/vt/vrt/collection/balance/temperedlb/temperedlb.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/vt/vrt/collection/balance/temperedlb/temperedlb.cc b/src/vt/vrt/collection/balance/temperedlb/temperedlb.cc index 9a4b2e8eab..3eecfa2ba9 100644 --- a/src/vt/vrt/collection/balance/temperedlb/temperedlb.cc +++ b/src/vt/vrt/collection/balance/temperedlb/temperedlb.cc @@ -2722,7 +2722,7 @@ void TemperedLB::swapClusters() { auto remote_block_count = getRemoteBlockCountHere(); runInEpochCollective("TemperedLB::swapClusters -> compute rejection", [=] { proxy_.allreduce<&TemperedLB::rejectionStatsHandler, collective::PlusOp>( - n_rejected, n_transfers_swap_, remote_block_count, cycle_locks_, iter_time_ + n_rejected, n_transfers_swap_, remote_block_count, cycle_locks_ ); proxy_.allreduce<&TemperedLB::maxIterTime, collective::MaxOp>(iter_time_); }); diff --git a/src/vt/vrt/collection/balance/temperedlb/temperedlb.h b/src/vt/vrt/collection/balance/temperedlb/temperedlb.h index 55bdf6ba84..ebf2614a43 100644 --- a/src/vt/vrt/collection/balance/temperedlb/temperedlb.h +++ b/src/vt/vrt/collection/balance/temperedlb/temperedlb.h @@ -127,7 +127,7 @@ struct TemperedLB : BaseLB { void loadStatsHandler(std::vector const& vec); void workStatsHandler(std::vector const& vec); void rejectionStatsHandler( - int n_rejected, int n_transfers, int n_unhomed_blocks + int n_rejected, int n_transfers, int n_unhomed_blocks, int cycle_count ); void maxIterTime(double max_iter_time); void remoteBlockCountHandler(int n_unhomed_blocks); From 8e14f8d1a474d5aeca92f39fc4b2e505e8b7d474 Mon Sep 17 00:00:00 2001 From: Jonathan Lifflander Date: Thu, 19 Dec 2024 14:40:27 -0800 Subject: [PATCH 05/10] #2382: ccm-lb: fix iter time so it's not additive --- src/vt/vrt/collection/balance/temperedlb/temperedlb.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/vt/vrt/collection/balance/temperedlb/temperedlb.cc b/src/vt/vrt/collection/balance/temperedlb/temperedlb.cc index 3eecfa2ba9..68afb25e33 100644 --- a/src/vt/vrt/collection/balance/temperedlb/temperedlb.cc +++ b/src/vt/vrt/collection/balance/temperedlb/temperedlb.cc @@ -2147,6 +2147,7 @@ void TemperedLB::originalTransfer() { if (theConfig()->vt_debug_temperedlb) { // compute rejection rate because it will be printed runInEpochCollective("TemperedLB::originalTransfer -> compute rejection", [=] { + iter_time_ = MPI_Wtime() - iter_time_; proxy_.allreduce<&TemperedLB::rejectionStatsHandler, collective::PlusOp>( n_rejected, n_transfers, 0, 0 ); @@ -2721,6 +2722,7 @@ void TemperedLB::swapClusters() { int n_rejected = 0; auto remote_block_count = getRemoteBlockCountHere(); runInEpochCollective("TemperedLB::swapClusters -> compute rejection", [=] { + iter_time_ = MPI_Wtime() - iter_time_; proxy_.allreduce<&TemperedLB::rejectionStatsHandler, collective::PlusOp>( n_rejected, n_transfers_swap_, remote_block_count, cycle_locks_ ); From 77190a39411ea32ff9649a7324554cb098bbfb18 Mon Sep 17 00:00:00 2001 From: Jonathan Lifflander Date: Mon, 23 Dec 2024 13:31:15 -0800 Subject: [PATCH 06/10] #2382: ccm-lb: implement incremental update for cluster summaries --- .../balance/temperedlb/temperedlb.cc | 244 +++++++++--------- .../balance/temperedlb/temperedlb.h | 9 + 2 files changed, 138 insertions(+), 115 deletions(-) diff --git a/src/vt/vrt/collection/balance/temperedlb/temperedlb.cc b/src/vt/vrt/collection/balance/temperedlb/temperedlb.cc index 68afb25e33..c61269b51c 100644 --- a/src/vt/vrt/collection/balance/temperedlb/temperedlb.cc +++ b/src/vt/vrt/collection/balance/temperedlb/temperedlb.cc @@ -638,131 +638,136 @@ void TemperedLB::readClustersMemoryData() { } } -void TemperedLB::computeClusterSummary() { - cur_clusters_.clear(); - - auto const this_node = theContext()->getNode(); - - for (auto const& [shared_id, shared_bytes] : shared_block_size_) { - auto const& [home_node, shared_volume] = shared_block_edge_[shared_id]; +ClusterInfo TemperedLB::makeClusterSummary(SharedIDType shared_id) { + auto const& [home_node, shared_volume] = shared_block_edge_[shared_id]; + auto const shared_bytes = shared_block_size_[shared_id] + + ClusterInfo info; + info.bytes = shared_bytes; + info.home_node = home_node; + info.edge_weight = shared_volume; + + std::set cluster_objs; + BytesType max_object_working_bytes = 0; + BytesType max_object_working_bytes_outside = 0; + BytesType max_object_serialized_bytes = 0; + BytesType max_object_serialized_bytes_outside = 0; + BytesType cluster_footprint = 0; + + for (auto const& [obj_id, obj_load] : cur_objs_) { + if (auto iter = obj_shared_block_.find(obj_id); iter != obj_shared_block_.end()) { + if (iter->second == shared_id) { + cluster_objs.insert(obj_id); + info.load += obj_load; + if ( + auto it = obj_working_bytes_.find(obj_id); + it != obj_working_bytes_.end() + ) { + max_object_working_bytes = std::max( + max_object_working_bytes, it->second + ); + } + if ( + auto it = obj_serialized_bytes_.find(obj_id); + it != obj_serialized_bytes_.end() + ) { + max_object_serialized_bytes = std::max( + max_object_serialized_bytes, it->second + ); + } + if ( + auto it = obj_footprint_bytes_.find(obj_id); + it != obj_footprint_bytes_.end() + ) { + cluster_footprint += it->second; + } + } else { + if ( + auto it = obj_working_bytes_.find(obj_id); + it != obj_working_bytes_.end() + ) { + max_object_working_bytes_outside = std::max( + max_object_working_bytes_outside, it->second + ); + } + if ( + auto it = obj_serialized_bytes_.find(obj_id); + it != obj_serialized_bytes_.end() + ) { + max_object_serialized_bytes_outside = std::max( + max_object_serialized_bytes_outside, it->second + ); + } + } + } + } - ClusterInfo info; - info.bytes = shared_bytes; - info.home_node = home_node; - info.edge_weight = shared_volume; + info.cluster_footprint = cluster_footprint; + info.max_object_working_bytes = max_object_working_bytes; + info.max_object_working_bytes_outside = max_object_working_bytes_outside; + info.max_object_serialized_bytes = max_object_serialized_bytes; + info.max_object_serialized_bytes_outside = max_object_serialized_bytes_outside; - std::set cluster_objs; - BytesType max_object_working_bytes = 0; - BytesType max_object_working_bytes_outside = 0; - BytesType max_object_serialized_bytes = 0; - BytesType max_object_serialized_bytes_outside = 0; - BytesType cluster_footprint = 0; + if (info.load != 0) { + for (auto&& obj : cluster_objs) { + if (auto it = send_edges_.find(obj); it != send_edges_.end()) { + for (auto const& [target, volume] : it->second) { + vt_debug_print( + verbose, temperedlb, + "computeClusterSummary: send obj={}, target={}\n", + obj, target + ); - for (auto const& [obj_id, obj_load] : cur_objs_) { - if (auto iter = obj_shared_block_.find(obj_id); iter != obj_shared_block_.end()) { - if (iter->second == shared_id) { - cluster_objs.insert(obj_id); - info.load += obj_load; - if ( - auto it = obj_working_bytes_.find(obj_id); - it != obj_working_bytes_.end() - ) { - max_object_working_bytes = std::max( - max_object_working_bytes, it->second - ); - } - if ( - auto it = obj_serialized_bytes_.find(obj_id); - it != obj_serialized_bytes_.end() - ) { - max_object_serialized_bytes = std::max( - max_object_serialized_bytes, it->second - ); - } - if ( - auto it = obj_footprint_bytes_.find(obj_id); - it != obj_footprint_bytes_.end() - ) { - cluster_footprint += it->second; - } - } else { - if ( - auto it = obj_working_bytes_.find(obj_id); - it != obj_working_bytes_.end() + if (cluster_objs.find(target) != cluster_objs.end()) { + // intra-cluster edge + info.intra_send_vol += volume; + } else if ( + cur_objs_.find(target) != cur_objs_.end() or + target.isLocatedOnThisNode() ) { - max_object_working_bytes_outside = std::max( - max_object_working_bytes_outside, it->second - ); + // intra-rank edge + info.inter_send_vol[this_node] += volume; + } else { + // inter-rank edge + info.inter_send_vol[target.getCurrNode()] += volume; } - if ( - auto it = obj_serialized_bytes_.find(obj_id); - it != obj_serialized_bytes_.end() + } + } + if (auto it = recv_edges_.find(obj); it != recv_edges_.end()) { + for (auto const& [target, volume] : it->second) { + vt_debug_print( + verbose, temperedlb, + "computeClusterSummary: recv obj={}, target={}\n", + obj, target + ); + if (cluster_objs.find(target) != cluster_objs.end()) { + // intra-cluster edge + info.intra_recv_vol += volume; + } else if ( + cur_objs_.find(target) != cur_objs_.end() or + target.isLocatedOnThisNode() ) { - max_object_serialized_bytes_outside = std::max( - max_object_serialized_bytes_outside, it->second - ); + // intra-rank edge + info.inter_recv_vol[this_node] += volume; + } else { + // inter-rank edge + info.inter_recv_vol[target.getCurrNode()] += volume; } } } } + } + return info; +} - info.cluster_footprint = cluster_footprint; - info.max_object_working_bytes = max_object_working_bytes; - info.max_object_working_bytes_outside = max_object_working_bytes_outside; - info.max_object_serialized_bytes = max_object_serialized_bytes; - info.max_object_serialized_bytes_outside = max_object_serialized_bytes_outside; - - if (info.load != 0) { - for (auto&& obj : cluster_objs) { - if (auto it = send_edges_.find(obj); it != send_edges_.end()) { - for (auto const& [target, volume] : it->second) { - vt_debug_print( - verbose, temperedlb, - "computeClusterSummary: send obj={}, target={}\n", - obj, target - ); - - if (cluster_objs.find(target) != cluster_objs.end()) { - // intra-cluster edge - info.intra_send_vol += volume; - } else if ( - cur_objs_.find(target) != cur_objs_.end() or - target.isLocatedOnThisNode() - ) { - // intra-rank edge - info.inter_send_vol[this_node] += volume; - } else { - // inter-rank edge - info.inter_send_vol[target.getCurrNode()] += volume; - } - } - } - if (auto it = recv_edges_.find(obj); it != recv_edges_.end()) { - for (auto const& [target, volume] : it->second) { - vt_debug_print( - verbose, temperedlb, - "computeClusterSummary: recv obj={}, target={}\n", - obj, target - ); - if (cluster_objs.find(target) != cluster_objs.end()) { - // intra-cluster edge - info.intra_recv_vol += volume; - } else if ( - cur_objs_.find(target) != cur_objs_.end() or - target.isLocatedOnThisNode() - ) { - // intra-rank edge - info.inter_recv_vol[this_node] += volume; - } else { - // inter-rank edge - info.inter_recv_vol[target.getCurrNode()] += volume; - } - } - } - } +void TemperedLB::computeClusterSummary() { + cur_clusters_.clear(); - cur_clusters_.emplace(shared_id, std::move(info)); - } + auto const this_node = theContext()->getNode(); + + for (auto const& [shared_id, _] : shared_block_size_) { + auto info = makeClusterSummary(shared_id); + cur_clusters_.emplace(shared_id, std::move(info)); } } @@ -2180,6 +2185,11 @@ auto TemperedLB::removeClusterToSend( if (shared_id != no_shared_id) { give_shared_blocks_size[shared_id] = shared_block_size_[shared_id]; + + // Update cur_clusters_ to avoid recomputing it + if (auto it = cur_clusters_.find(shared_id); it != cur_clusters_.end()) { + cur_clusters_.erase(it); + } } if (objs.size() == 0) { @@ -2406,7 +2416,7 @@ void TemperedLB::considerSwapsAfterLock(MsgSharedPtr msg) { ); }); - computeClusterSummary(); + //computeClusterSummary(); this_new_breakdown_ = computeWorkBreakdown(this_node, cur_objs_); this_new_work_ = this_new_breakdown_.work; computeMemoryUsage(); @@ -2483,7 +2493,11 @@ void TemperedLB::giveCluster( ); } - computeClusterSummary(); + auto id = give_shared_blocks_size.begin()->first; + auto info = makeClusterSummary(id); + cur_clusters_.emplace(id, std::move(info)) + + //computeClusterSummary(); this_new_breakdown_ = computeWorkBreakdown(this_node, cur_objs_); this_new_work_ = this_new_breakdown_.work; computeMemoryUsage(); diff --git a/src/vt/vrt/collection/balance/temperedlb/temperedlb.h b/src/vt/vrt/collection/balance/temperedlb/temperedlb.h index ebf2614a43..ca30561b4e 100644 --- a/src/vt/vrt/collection/balance/temperedlb/temperedlb.h +++ b/src/vt/vrt/collection/balance/temperedlb/temperedlb.h @@ -169,6 +169,15 @@ struct TemperedLB : BaseLB { */ void computeClusterSummary(); + /** + * \brief Make cluster summary info + * + * \param[in] shared_id the shared ID + * + * \return the info + */ + ClusterInfo makeClusterSummary(SharedIDType shared_id); + /** * \brief Try to lock a rank * From 56338f7552fe0b5e0cbd66799d6f382a7fa70bc1 Mon Sep 17 00:00:00 2001 From: Jonathan Lifflander Date: Mon, 23 Dec 2024 13:40:17 -0800 Subject: [PATCH 07/10] #2382: ccm-lb: fix some compile-time issues --- src/vt/vrt/collection/balance/temperedlb/temperedlb.cc | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/vt/vrt/collection/balance/temperedlb/temperedlb.cc b/src/vt/vrt/collection/balance/temperedlb/temperedlb.cc index c61269b51c..0b0730ec1d 100644 --- a/src/vt/vrt/collection/balance/temperedlb/temperedlb.cc +++ b/src/vt/vrt/collection/balance/temperedlb/temperedlb.cc @@ -639,8 +639,9 @@ void TemperedLB::readClustersMemoryData() { } ClusterInfo TemperedLB::makeClusterSummary(SharedIDType shared_id) { + auto const this_node = theContext()->getNode(); auto const& [home_node, shared_volume] = shared_block_edge_[shared_id]; - auto const shared_bytes = shared_block_size_[shared_id] + auto const shared_bytes = shared_block_size_[shared_id]; ClusterInfo info; info.bytes = shared_bytes; @@ -763,8 +764,6 @@ ClusterInfo TemperedLB::makeClusterSummary(SharedIDType shared_id) { void TemperedLB::computeClusterSummary() { cur_clusters_.clear(); - auto const this_node = theContext()->getNode(); - for (auto const& [shared_id, _] : shared_block_size_) { auto info = makeClusterSummary(shared_id); cur_clusters_.emplace(shared_id, std::move(info)); @@ -2495,7 +2494,7 @@ void TemperedLB::giveCluster( auto id = give_shared_blocks_size.begin()->first; auto info = makeClusterSummary(id); - cur_clusters_.emplace(id, std::move(info)) + cur_clusters_.emplace(id, std::move(info)); //computeClusterSummary(); this_new_breakdown_ = computeWorkBreakdown(this_node, cur_objs_); From 1889b346969a306f8840207e4e8091f14280f426 Mon Sep 17 00:00:00 2001 From: Jonathan Lifflander Date: Mon, 23 Dec 2024 14:09:25 -0800 Subject: [PATCH 08/10] #2382: ccm-lb: stop adding cluster when load is zero (empty cluster) --- src/vt/vrt/collection/balance/temperedlb/temperedlb.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/vt/vrt/collection/balance/temperedlb/temperedlb.cc b/src/vt/vrt/collection/balance/temperedlb/temperedlb.cc index 0b0730ec1d..8f30e27883 100644 --- a/src/vt/vrt/collection/balance/temperedlb/temperedlb.cc +++ b/src/vt/vrt/collection/balance/temperedlb/temperedlb.cc @@ -766,7 +766,9 @@ void TemperedLB::computeClusterSummary() { for (auto const& [shared_id, _] : shared_block_size_) { auto info = makeClusterSummary(shared_id); - cur_clusters_.emplace(shared_id, std::move(info)); + if (info.load != 0) { + cur_clusters_.emplace(shared_id, std::move(info)); + } } } From dc55d2c3a0e0d601281cf433ba519d585f3d6733 Mon Sep 17 00:00:00 2001 From: Jonathan Lifflander Date: Mon, 23 Dec 2024 14:20:24 -0800 Subject: [PATCH 09/10] #2382: ccm-lb: another optimization --- src/vt/vrt/collection/balance/temperedlb/temperedlb.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/vt/vrt/collection/balance/temperedlb/temperedlb.cc b/src/vt/vrt/collection/balance/temperedlb/temperedlb.cc index 8f30e27883..9fadcf94d0 100644 --- a/src/vt/vrt/collection/balance/temperedlb/temperedlb.cc +++ b/src/vt/vrt/collection/balance/temperedlb/temperedlb.cc @@ -1213,7 +1213,9 @@ void TemperedLB::doLBStages(LoadType start_imb) { vtAbort("This should never be possible to go over the threshold\n"); } - computeClusterSummary(); + if (iter_ == 0) { + computeClusterSummary(); + } // Verbose printing about local clusters for (auto const& [shared_id, cluster_info] : cur_clusters_) { From 900f9aa7ee3ed202849af6c3e1ce1f57825cb59f Mon Sep 17 00:00:00 2001 From: Jonathan Lifflander Date: Mon, 23 Dec 2024 14:33:28 -0800 Subject: [PATCH 10/10] #2382: ccm-lb: reduce sleep to 100us --- src/vt/vrt/collection/balance/temperedlb/temperedlb.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/vt/vrt/collection/balance/temperedlb/temperedlb.cc b/src/vt/vrt/collection/balance/temperedlb/temperedlb.cc index 9fadcf94d0..539b7f6a6c 100644 --- a/src/vt/vrt/collection/balance/temperedlb/temperedlb.cc +++ b/src/vt/vrt/collection/balance/temperedlb/temperedlb.cc @@ -2592,7 +2592,7 @@ void TemperedLB::satisfyLockRequest() { try_locks_.erase(iter); if (lock.forced_release) { - std::this_thread::sleep_for(std::chrono::milliseconds(5)); + std::this_thread::sleep_for(std::chrono::microseconds(100)); lock.forced_release = false; try_locks_.insert(lock); return;