From e28e7ef522dbcd49c4e5e8fcc202e1a0d6cbe0f2 Mon Sep 17 00:00:00 2001 From: Caleb Schilly Date: Fri, 10 Jan 2025 16:10:28 -0500 Subject: [PATCH 1/5] #2387: initial commit (wip); store hashed events in theTrace() and attempt allreduce at end of run --- src/vt/trace/trace.cc | 4 ++++ src/vt/trace/trace.h | 8 ++++++++ src/vt/trace/trace_lite.cc | 9 +++++++++ src/vt/trace/trace_lite.h | 1 + src/vt/trace/trace_user_event.cc | 7 ++----- 5 files changed, 24 insertions(+), 5 deletions(-) diff --git a/src/vt/trace/trace.cc b/src/vt/trace/trace.cc index c4d2ed0f1c..3b23b544a2 100644 --- a/src/vt/trace/trace.cc +++ b/src/vt/trace/trace.cc @@ -220,6 +220,10 @@ void insertNewUserEvent( #endif } +void Trace::addHashedEventMsg(Message event_msg) { + theTrace()->user_hashed_events_.push_back(event_msg); +} + void Trace::addUserEvent(UserEventIDType event) { if (not checkDynamicRuntimeEnabled()) { return; diff --git a/src/vt/trace/trace.h b/src/vt/trace/trace.h index 232f36d4e5..fede657ce6 100644 --- a/src/vt/trace/trace.h +++ b/src/vt/trace/trace.h @@ -220,6 +220,13 @@ struct Trace : runtime::component::Component, TraceLite { */ void registerUserEventManual(std::string const& name, UserSpecEventIDType id); + /** + * \brief Store a hashed event + * + * \param[in] event_msg the hashed event message + */ + void addHashedEventMsg(Message event_msg); + /** * \brief Log a user event * @@ -374,6 +381,7 @@ struct Trace : runtime::component::Component, TraceLite { | idle_begun_ | start_time_ | user_event_ + | user_hashed_events_ | prog_name_ | trace_name_ | full_trace_name_ diff --git a/src/vt/trace/trace_lite.cc b/src/vt/trace/trace_lite.cc index 8777585e4b..d7b65de442 100644 --- a/src/vt/trace/trace_lite.cc +++ b/src/vt/trace/trace_lite.cc @@ -543,6 +543,15 @@ void TraceLite::flushTracesFile(bool useGlobalSync) { void TraceLite::writeTracesFile(int flush, bool is_incremental_flush) { auto const node = theContext()->getNode(); + // Allreduce the hashed events to rank 0 before writing sts file + auto const root = 0; + std::vector all_hashed_events; + auto msg = makeMessage>( + theTrace()->user_hashed_events_); + theCollective()->global()->reduce< + PlusOp>, Verify + >(root, msg.get()); + size_t to_write = traces_.size(); if (traceWritingEnabled(node) and to_write > 0) { diff --git a/src/vt/trace/trace_lite.h b/src/vt/trace/trace_lite.h index 096a506b45..62ee0e158b 100644 --- a/src/vt/trace/trace_lite.h +++ b/src/vt/trace/trace_lite.h @@ -427,6 +427,7 @@ struct TraceLite { int incremental_flush_mode_ = 0; UserEventRegistry user_event_ = {}; + std::vector user_hashed_events_; EventHoldStackType event_holds_; TraceStackType open_events_; TraceContainerType traces_; diff --git a/src/vt/trace/trace_user_event.cc b/src/vt/trace/trace_user_event.cc index e707c1c840..d394579881 100644 --- a/src/vt/trace/trace_user_event.cc +++ b/src/vt/trace/trace_user_event.cc @@ -84,11 +84,8 @@ UserEventIDType UserEventRegistry::hash(std::string const& in_event_name) { auto id = std::get<0>(ret); auto inserted = std::get<1>(ret); if (inserted) { - auto const node = theContext()->getNode(); - if (node != 0) { - auto msg = makeMessage(false, id, in_event_name); - theMsg()->sendMsg(0, msg); - } + auto msg = makeMessage(false, id, in_event_name); + theTrace->addHashedEventMsg(msg); } return id; } From 779e46ddafd04ab50722091cf4b1cfd521249750 Mon Sep 17 00:00:00 2001 From: Caleb Schilly Date: Mon, 13 Jan 2025 12:35:24 -0500 Subject: [PATCH 2/5] #2387: store event id instead of msg (wip) --- src/vt/trace/trace.cc | 4 ++-- src/vt/trace/trace.h | 6 +++--- src/vt/trace/trace_lite.cc | 6 +++--- src/vt/trace/trace_lite.h | 2 +- src/vt/trace/trace_user_event.cc | 3 +-- 5 files changed, 10 insertions(+), 11 deletions(-) diff --git a/src/vt/trace/trace.cc b/src/vt/trace/trace.cc index 3b23b544a2..d5e1077dab 100644 --- a/src/vt/trace/trace.cc +++ b/src/vt/trace/trace.cc @@ -220,8 +220,8 @@ void insertNewUserEvent( #endif } -void Trace::addHashedEventMsg(Message event_msg) { - theTrace()->user_hashed_events_.push_back(event_msg); +void Trace::addHashedEvent(UserEventIDType event_id) { + theTrace()->user_hashed_events_.push_back(event_id); } void Trace::addUserEvent(UserEventIDType event) { diff --git a/src/vt/trace/trace.h b/src/vt/trace/trace.h index fede657ce6..87cc998664 100644 --- a/src/vt/trace/trace.h +++ b/src/vt/trace/trace.h @@ -221,11 +221,11 @@ struct Trace : runtime::component::Component, TraceLite { void registerUserEventManual(std::string const& name, UserSpecEventIDType id); /** - * \brief Store a hashed event + * \brief Store a hashed event ID * - * \param[in] event_msg the hashed event message + * \param[in] event_id the hashed event ID */ - void addHashedEventMsg(Message event_msg); + void addHashedEvent(UserEventIDType event_id); /** * \brief Log a user event diff --git a/src/vt/trace/trace_lite.cc b/src/vt/trace/trace_lite.cc index d7b65de442..281daad40d 100644 --- a/src/vt/trace/trace_lite.cc +++ b/src/vt/trace/trace_lite.cc @@ -545,11 +545,11 @@ void TraceLite::writeTracesFile(int flush, bool is_incremental_flush) { // Allreduce the hashed events to rank 0 before writing sts file auto const root = 0; - std::vector all_hashed_events; - auto msg = makeMessage>( + std::vector all_hashed_events; + auto msg = makeMessage>( theTrace()->user_hashed_events_); theCollective()->global()->reduce< - PlusOp>, Verify + PlusOp>, Verify >(root, msg.get()); size_t to_write = traces_.size(); diff --git a/src/vt/trace/trace_lite.h b/src/vt/trace/trace_lite.h index 62ee0e158b..1924f6c2ef 100644 --- a/src/vt/trace/trace_lite.h +++ b/src/vt/trace/trace_lite.h @@ -427,7 +427,7 @@ struct TraceLite { int incremental_flush_mode_ = 0; UserEventRegistry user_event_ = {}; - std::vector user_hashed_events_; + std::vector user_hashed_events_; EventHoldStackType event_holds_; TraceStackType open_events_; TraceContainerType traces_; diff --git a/src/vt/trace/trace_user_event.cc b/src/vt/trace/trace_user_event.cc index d394579881..c83c604343 100644 --- a/src/vt/trace/trace_user_event.cc +++ b/src/vt/trace/trace_user_event.cc @@ -84,8 +84,7 @@ UserEventIDType UserEventRegistry::hash(std::string const& in_event_name) { auto id = std::get<0>(ret); auto inserted = std::get<1>(ret); if (inserted) { - auto msg = makeMessage(false, id, in_event_name); - theTrace->addHashedEventMsg(msg); + theTrace->addHashedEvent(id); } return id; } From d25f3f268b58a4c70ba7f4bfd85b5f61c823c8aa Mon Sep 17 00:00:00 2001 From: Caleb Schilly Date: Mon, 13 Jan 2025 15:52:02 -0500 Subject: [PATCH 3/5] #2387: use gather instead of allreduce --- src/vt/trace/trace_lite.cc | 42 ++++++++++++++++++++++++++------ src/vt/trace/trace_user_event.cc | 2 +- 2 files changed, 36 insertions(+), 8 deletions(-) diff --git a/src/vt/trace/trace_lite.cc b/src/vt/trace/trace_lite.cc index 281daad40d..1f082d8fb9 100644 --- a/src/vt/trace/trace_lite.cc +++ b/src/vt/trace/trace_lite.cc @@ -59,6 +59,7 @@ #include #include #include +#include namespace vt { #if vt_check_enabled(trace_only) @@ -542,15 +543,42 @@ void TraceLite::flushTracesFile(bool useGlobalSync) { void TraceLite::writeTracesFile(int flush, bool is_incremental_flush) { auto const node = theContext()->getNode(); + auto const comm = theContext()->getComm(); + auto const comm_size = theContext()->getNumNodes(); - // Allreduce the hashed events to rank 0 before writing sts file + // Gather all hashed events to rank 0 before writing sts file + using events_t = std::vector; auto const root = 0; - std::vector all_hashed_events; - auto msg = makeMessage>( - theTrace()->user_hashed_events_); - theCollective()->global()->reduce< - PlusOp>, Verify - >(root, msg.get()); + events_t local_hashed_events = theTrace()->user_hashed_events_; + int local_size = local_hashed_events.size(); + std::vector all_sizes(comm_size); + MPI_Gather(&local_size, 1, MPI_INT, all_sizes.data(), 1, MPI_INT, 0, comm); + + // Compute displacements + std::vector displs(comm_size, 0); + if (node == 0) { + std::partial_sum(all_sizes.begin(), all_sizes.end() - 1, displs.begin() + 1); + } + + // Create vector in which to store all events + events_t all_hashed_events; + if (node == 0) { + int total_size = std::accumulate(all_sizes.begin(), all_sizes.end(), 0); + all_hashed_events.resize(total_size); + } + + // Gather events + MPI_Gatherv( + local_hashed_events.data(), // Send buffer + local_size, // Number of elements to send + MPI_UINT32_T, // Data type (adjust to match UserEventIDType) + all_hashed_events.data(), // Receive buffer (on root) + all_sizes.data(), // Number of elements to receive from each rank + displs.data(), // Displacements for each rank + MPI_UINT32_T, // Data type (adjust to match UserEventIDType) + root, // Root node + comm // Communicator + ); size_t to_write = traces_.size(); diff --git a/src/vt/trace/trace_user_event.cc b/src/vt/trace/trace_user_event.cc index c83c604343..e5e3f7158b 100644 --- a/src/vt/trace/trace_user_event.cc +++ b/src/vt/trace/trace_user_event.cc @@ -84,7 +84,7 @@ UserEventIDType UserEventRegistry::hash(std::string const& in_event_name) { auto id = std::get<0>(ret); auto inserted = std::get<1>(ret); if (inserted) { - theTrace->addHashedEvent(id); + vt::theTrace()->addHashedEvent(id); } return id; } From e4e471f0561e4f7f373b50ca334e6ce21398913d Mon Sep 17 00:00:00 2001 From: Caleb Schilly Date: Tue, 14 Jan 2025 12:24:34 -0500 Subject: [PATCH 4/5] #2387: overload + op for UserEventRegistry and implement gather for user_event_ --- src/vt/trace/trace.cc | 4 ---- src/vt/trace/trace.h | 8 ------- src/vt/trace/trace_lite.cc | 37 +++----------------------------- src/vt/trace/trace_lite.h | 1 - src/vt/trace/trace_user_event.cc | 23 ++++++++------------ 5 files changed, 12 insertions(+), 61 deletions(-) diff --git a/src/vt/trace/trace.cc b/src/vt/trace/trace.cc index d5e1077dab..c4d2ed0f1c 100644 --- a/src/vt/trace/trace.cc +++ b/src/vt/trace/trace.cc @@ -220,10 +220,6 @@ void insertNewUserEvent( #endif } -void Trace::addHashedEvent(UserEventIDType event_id) { - theTrace()->user_hashed_events_.push_back(event_id); -} - void Trace::addUserEvent(UserEventIDType event) { if (not checkDynamicRuntimeEnabled()) { return; diff --git a/src/vt/trace/trace.h b/src/vt/trace/trace.h index 87cc998664..232f36d4e5 100644 --- a/src/vt/trace/trace.h +++ b/src/vt/trace/trace.h @@ -220,13 +220,6 @@ struct Trace : runtime::component::Component, TraceLite { */ void registerUserEventManual(std::string const& name, UserSpecEventIDType id); - /** - * \brief Store a hashed event ID - * - * \param[in] event_id the hashed event ID - */ - void addHashedEvent(UserEventIDType event_id); - /** * \brief Log a user event * @@ -381,7 +374,6 @@ struct Trace : runtime::component::Component, TraceLite { | idle_begun_ | start_time_ | user_event_ - | user_hashed_events_ | prog_name_ | trace_name_ | full_trace_name_ diff --git a/src/vt/trace/trace_lite.cc b/src/vt/trace/trace_lite.cc index 1f082d8fb9..d52661cf73 100644 --- a/src/vt/trace/trace_lite.cc +++ b/src/vt/trace/trace_lite.cc @@ -59,7 +59,6 @@ #include #include #include -#include namespace vt { #if vt_check_enabled(trace_only) @@ -546,39 +545,9 @@ void TraceLite::writeTracesFile(int flush, bool is_incremental_flush) { auto const comm = theContext()->getComm(); auto const comm_size = theContext()->getNumNodes(); - // Gather all hashed events to rank 0 before writing sts file - using events_t = std::vector; - auto const root = 0; - events_t local_hashed_events = theTrace()->user_hashed_events_; - int local_size = local_hashed_events.size(); - std::vector all_sizes(comm_size); - MPI_Gather(&local_size, 1, MPI_INT, all_sizes.data(), 1, MPI_INT, 0, comm); - - // Compute displacements - std::vector displs(comm_size, 0); - if (node == 0) { - std::partial_sum(all_sizes.begin(), all_sizes.end() - 1, displs.begin() + 1); - } - - // Create vector in which to store all events - events_t all_hashed_events; - if (node == 0) { - int total_size = std::accumulate(all_sizes.begin(), all_sizes.end(), 0); - all_hashed_events.resize(total_size); - } - - // Gather events - MPI_Gatherv( - local_hashed_events.data(), // Send buffer - local_size, // Number of elements to send - MPI_UINT32_T, // Data type (adjust to match UserEventIDType) - all_hashed_events.data(), // Receive buffer (on root) - all_sizes.data(), // Number of elements to receive from each rank - displs.data(), // Displacements for each rank - MPI_UINT32_T, // Data type (adjust to match UserEventIDType) - root, // Root node - comm // Communicator - ); + vt::runInEpochCollective([&]{ + proxy.reduce(0, std::move(user_event_)); + }); size_t to_write = traces_.size(); diff --git a/src/vt/trace/trace_lite.h b/src/vt/trace/trace_lite.h index 1924f6c2ef..096a506b45 100644 --- a/src/vt/trace/trace_lite.h +++ b/src/vt/trace/trace_lite.h @@ -427,7 +427,6 @@ struct TraceLite { int incremental_flush_mode_ = 0; UserEventRegistry user_event_ = {}; - std::vector user_hashed_events_; EventHoldStackType event_holds_; TraceStackType open_events_; TraceContainerType traces_; diff --git a/src/vt/trace/trace_user_event.cc b/src/vt/trace/trace_user_event.cc index e5e3f7158b..3232ea0228 100644 --- a/src/vt/trace/trace_user_event.cc +++ b/src/vt/trace/trace_user_event.cc @@ -82,21 +82,12 @@ UserEventIDType UserEventRegistry::hash(std::string const& in_event_name) { id_hash = id_hash & 0x0FFF; auto ret = newEventImpl(false, false, in_event_name, id_hash, true); auto id = std::get<0>(ret); - auto inserted = std::get<1>(ret); - if (inserted) { - vt::theTrace()->addHashedEvent(id); - } return id; } UserEventIDType UserEventRegistry::rooted(std::string const& in_event_name) { auto ret = newEventImpl(false, true, in_event_name, cur_root_event_++); auto id = std::get<0>(ret); - auto const node = theContext()->getNode(); - if (node != 0) { - auto msg = makeMessage(false, id, in_event_name); - theMsg()->sendMsg(0, msg); - } return id; } @@ -105,11 +96,6 @@ UserEventIDType UserEventRegistry::user( ) { auto ret = newEventImpl(true, false, in_event_name, seq); auto id = std::get<0>(ret); - auto const node = theContext()->getNode(); - if (node != 0) { - auto msg = makeMessage(true, id, in_event_name); - theMsg()->sendMsg(0, msg); - } return id; } #endif @@ -141,9 +127,18 @@ bool UserEventRegistry::insertEvent( ); return true; } else { + user_event_[event] += " COLLISION " + name; return false; } } +UserEventRegistry operator+( + UserEventRegistry r1, UserEventRegistry const& r2 +) { + for (auto& [hash, event_str] : r2.getEvents()) { + r1.insertEvent(hash, event_str); + } + return r1; +} }} /* end namespace vt::trace */ From 2daccb62f69dc24d84d0918a8fb1c53e7043fe54 Mon Sep 17 00:00:00 2001 From: Caleb Schilly Date: Tue, 14 Jan 2025 12:25:33 -0500 Subject: [PATCH 5/5] #2387: remove unneeded comm info --- src/vt/trace/trace_lite.cc | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/vt/trace/trace_lite.cc b/src/vt/trace/trace_lite.cc index d52661cf73..db68dcd6e2 100644 --- a/src/vt/trace/trace_lite.cc +++ b/src/vt/trace/trace_lite.cc @@ -542,8 +542,6 @@ void TraceLite::flushTracesFile(bool useGlobalSync) { void TraceLite::writeTracesFile(int flush, bool is_incremental_flush) { auto const node = theContext()->getNode(); - auto const comm = theContext()->getComm(); - auto const comm_size = theContext()->getNumNodes(); vt::runInEpochCollective([&]{ proxy.reduce(0, std::move(user_event_));