Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft: #48: vt and magistrate support #49

Draft
wants to merge 17 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 26 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,15 @@ set_property(TARGET resilience PROPERTY CXX_STANDARD ${Kokkos_CXX_STANDARD})
target_link_libraries(resilience PUBLIC Kokkos::kokkos)

option(KR_ENABLE_VELOC "use VeloC backend for automatic checkpointing" ON)
option(KR_ENABLE_STDFILE "use StdFile backend for automatic checkpointing" ON)
option(KR_ENABLE_STDFILE "use StdFile backend for automatic checkpointing" OFF)

option(KR_ENABLE_MAGISTRATE "use Magistrate for serializing and deserializing" OFF)
option(KR_ENABLE_RESILIENT_EXEC "enable resilient execution spaces" OFF)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, I just noticed this is wrong; see KR_ENABLE_EXEC_SPACES below. I think I messed this up somewhere during the rebase.


option(KR_ENABLE_VT "use VT for backend coordination" OFF)

option(KR_ENABLE_OPENMP "enable the resilient OpenMP execution space" OFF)
option(KR_ENABLE_CUDA "enable the resilient CUDA execution space" OFF)

include(CMakeDependentOption)

Expand All @@ -55,12 +63,23 @@ if (KR_ENABLE_VELOC)
endif()
endif()

if (KR_ENABLE_VT)
find_package(vt REQUIRED)
target_link_libraries(resilience PUBLIC vt::runtime::vt)
target_compile_definitions(resilience PUBLIC KR_ENABLE_VT)

set(KR_ENABLE_MPI_BACKENDS ON)
set(KR_ENABLE_MAGISTRATE ON)
endif()

# StdFile backend
if (KR_ENABLE_STDFILE)
target_compile_definitions(resilience PUBLIC KR_ENABLE_STDFILE)
endif()

if (KR_ENABLE_MPI_BACKENDS)
find_package(MPI REQUIRED)
target_link_libraries(resilience PRIVATE MPI::MPI_CXX)
target_compile_definitions(resilience PUBLIC KR_ENABLE_MPI_BACKENDS)
endif()

Expand All @@ -70,6 +89,12 @@ if (KR_ENABLE_TRACING)
target_compile_definitions(resilience PUBLIC KR_ENABLE_TRACING)
endif()

if (KR_ENABLE_MAGISTRATE)
find_package(checkpoint REQUIRED)
target_link_libraries(resilience PUBLIC vt::lib::checkpoint)
target_compile_definitions(resilience PUBLIC KR_ENABLE_MAGISTRATE)
endif()

option( KR_ENABLE_STDIO "use stdio for manual checkpoint" OFF )
option( KR_ENABLE_HDF5 "add HDF5 support" OFF )
option( KR_ENABLE_HDF5_PARALLEL "use parallel version of HDF5" OFF )
Expand Down
3 changes: 2 additions & 1 deletion CMakePresets.json
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,8 @@
"KR_ENABLE_TESTS": "ON",
"KR_ENABLE_EXAMPLES": "ON",
"KR_ALL_WARNINGS": "ON",
"KR_WARNINGS_AS_ERRORS": "ON"
"KR_WARNINGS_AS_ERRORS": "ON",
"KR_ENABLE_STDFILE": "OFF"
}
}
],
Expand Down
9 changes: 8 additions & 1 deletion cmake/resilienceConfig.cmake.in
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,10 @@ include("${CMAKE_CURRENT_LIST_DIR}/resilienceTargets.cmake")

SET(KR_ENABLE_HDF5 @KR_ENABLE_HDF5@)
SET(KR_ENABLE_VELOC @KR_ENABLE_VELOC@)
SET(KR_ENABLE_MAGISTRATE @KR_ENABLE_MAGISTRATE@)

# VeloC needs to add a cmake config...
LIST(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/Modules/")
LIST(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/../Modules/" "${CMAKE_CURRENT_LIST_DIR}/../cmake/Modules/")
message(STATUS "Module path: ${CMAKE_MODULE_PATH}")

find_dependency(Kokkos REQUIRED NO_CMAKE_PACKAGE_REGISTRY HINTS @Kokkos_DIR@)
Expand All @@ -25,5 +26,11 @@ if (@KR_ENABLE_HDF5@)
find_dependency(HDF5 REQUIRED)
endif()

if (@KR_ENABLE_MAGISTRATE@)
set(CHECKPOINT_DIR @CHECKPOINT_DIR@)
find_dependency(checkpoint REQUIRED)
set(KR_ENABLE_MAGISTRATE @KR_ENABLE_MAGISTRATE@)
endif()

set(Boost_DIR @Boost_DIR@)
find_dependency(Boost REQUIRED)
13 changes: 12 additions & 1 deletion examples/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,15 @@ function(add_example _target)
target_resources(${_target} PRIVATE ${ARG_RESOURCES})
target_link_libraries(${_target} PRIVATE Kokkos::resilience)
target_link_libraries(${_target} PRIVATE cxxopts::cxxopts)
if (KR_ENABLE_VELOC OR KR_ENABLE_HDF5_PARALLEL)
if (KR_ENABLE_MPI_BACKENDS)
target_link_libraries(${_target} PRIVATE MPI::MPI_CXX)
endif()
if (KR_ENABLE_MAGISTRATE)
target_link_libraries(${_target} PRIVATE vt::lib::checkpoint)
endif()
if (KR_ENABLE_VT)
target_link_libraries(${_target} PRIVATE vt::runtime::vt)
endif()

if (KR_WARNINGS_AS_ERRORS)
if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
Expand All @@ -42,3 +48,8 @@ if (KR_ENABLE_STDFILE)
add_example(simple_file_checkpoint SOURCES SimpleFileCheckpoint.cpp
RESOURCES config_file.json file_test.cfg)
endif()

if(KR_ENABLE_VT)
add_example(jacobi_checkpoint SOURCES jacobi/main.cpp jacobi/solver.cpp jacobi/config.cpp
RESOURCES jacobi/config_jacobi.json jacobi/config_jacobi_async.json jacobi/config_jacobi_1.json jacobi/config_jacobi_more_async.json)
endif()
19 changes: 14 additions & 5 deletions examples/SimpleCheckpoint.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,9 +46,9 @@

#include <mpi.h>
#include <Kokkos_Core.hpp>
#include <resilience/Context.hpp>
#include <resilience/veloc/VelocBackend.hpp>
#include <resilience/AutomaticCheckpoint.hpp>
#include <resilience/Resilience.hpp>

using chkpt_view = Kokkos::Experimental::SubscribableViewHooks<KokkosResilience::DynamicViewHooksSubscriber>;

int
main( int argc, char **argv )
Expand All @@ -60,15 +60,24 @@ main( int argc, char **argv )
auto ctx = KokkosResilience::make_context( MPI_COMM_WORLD, "config.json" );

int dim0 = 5, dim1 = 5;
auto view = Kokkos::View< double ** >( "test_view", dim0, dim1 );
auto view = Kokkos::View< double **, chkpt_view>( "test_view", dim0, dim1 );

KokkosResilience::checkpoint( *ctx, "test_checkpoint", 0, [view, dim0, dim1]() {
Kokkos::parallel_for( dim0, KOKKOS_LAMBDA( int i ) {
for ( int j = 0; j < dim1; ++j )
view( i, j ) = 3.0;
} );
} );
});

for(int i = 0; i < dim0; i++){
for(int j = 0; j < dim1; j++){
if(view(i,j) != 3.0) {
fprintf(stderr, "Error: view(%d,%d) = %f, not %f\n", i, j, view(i,j), 3.0);
exit(1);
}
}
}
printf("Success!\n");
}
Kokkos::finalize();

Expand Down
27 changes: 21 additions & 6 deletions examples/SimpleFileCheckpoint.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,27 +46,42 @@
#endif

#include <Kokkos_Core.hpp>
#include <resilience/Context.hpp>
#include <resilience/stdfile/StdFileBackend.hpp>
#include <resilience/AutomaticCheckpoint.hpp>
#include <resilience/Resilience.hpp>
#include <mpi.h>

using chkpt_view = Kokkos::Experimental::SubscribableViewHooks<KokkosResilience::DynamicViewHooksSubscriber>;

int
main( int argc, char **argv )
{
MPI_Init( &argc, &argv );

Kokkos::initialize( argc, argv );
{
auto ctx = KokkosResilience::make_context( "checkpoint.data", "config_file.json" );
auto ctx = KokkosResilience::make_context( MPI_COMM_WORLD, "config_file.json" );

int dim0 = 5, dim1 = 5;
auto view = Kokkos::View< double ** >( "test_view", dim0, dim1 );
auto view = Kokkos::View< double **, chkpt_view>( "test_view", dim0, dim1 );

KokkosResilience::checkpoint( *ctx, "test_checkpoint", 0, [view, dim0, dim1]() {
Kokkos::parallel_for( dim0, KOKKOS_LAMBDA( int i ) {
for ( int j = 0; j < dim1; ++j )
view( i, j ) = 3.0;
} );
} );
}, [](int){return true;} );

for(int i = 0; i < dim0; i++){
for(int j = 0; j < dim1; j++){
if(view(i,j) != 3.0) {
fprintf(stderr, "Error: view(%d,%d) = %f, not %f\n", i, j, view(i,j), 3.0);
exit(1);
}
}
}
printf("Success!\n");

}
Kokkos::finalize();

MPI_Finalize();
}
2 changes: 1 addition & 1 deletion examples/benchmark_multiviews.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@ int main(int argc, char *argv[]) {
wtime = MPI_Wtime();
std::size_t i = 1 + KokkosResilience::latest_version(*ctx, "test_kokkos");

while(i < nsteps) {
while(i < nsteps ) {

KokkosResilience::checkpoint(*ctx, "test_kokkos", i, [=]() { // Nic, tell me what should I put for []/

Expand Down
5 changes: 3 additions & 2 deletions examples/config_file.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,12 @@
"backend": "stdfile",
"backends": {
"stdfile": {
"config": "file_test.cfg"
"directory": "./stdfile_chkpts/",
"filename_prefix": "simple_"
}
},
"filter": {
"type": "time",
"interval": 10
}
}
}
122 changes: 122 additions & 0 deletions examples/jacobi/config.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
#include "config.hpp"

#include <cassert>
#include <stdexcept>
#include <string>

#include <vt/transport.h>

namespace Jacobi {
/// Builds the solver configuration from the command line.
///
/// Recognized options (any other argument is ignored):
///   --decomp X Y Z      sets colRange
///   --input X Y Z       sets dataRange
///   --max-iters N       sets maxIter
///   --tolerance T       sets tolerance
///   --async-serialize   sets asyncCheckpoint to true
///
/// After parsing, node 0 prints a summary of the simulation parameters.
///
/// @throws std::invalid_argument if an option is not followed by all of the
///         values it needs. std::stoi / std::stod additionally throw on
///         malformed or out-of-range numbers.
Config::Config(int argc, char** argv){
  int i = 0;

  // Fetches the next command-line token as the value of `option`, advancing
  // the cursor. Refuses to step past the end of argv instead of reading
  // argv[argc] (a null pointer) or beyond.
  const auto next_value = [&](const std::string& option) -> std::string {
    if(i + 1 >= argc)
      throw std::invalid_argument("Missing value for option " + option);
    return argv[++i];
  };

  for(; i < argc; i++){
    const std::string arg = argv[i];
    if(arg == "--decomp"){
      // Separate statements keep the X, Y, Z read order well defined.
      const int x = std::stoi(next_value(arg));
      const int y = std::stoi(next_value(arg));
      const int z = std::stoi(next_value(arg));
      colRange = vt::Index3D(x,y,z);
    } else if(arg == "--input"){
      const int x = std::stoi(next_value(arg));
      const int y = std::stoi(next_value(arg));
      const int z = std::stoi(next_value(arg));
      dataRange = vt::Index3D(x,y,z);
    } else if(arg == "--max-iters") {
      maxIter = std::stoi(next_value(arg));
    } else if(arg == "--tolerance") {
      tolerance = std::stod(next_value(arg));
    } else if(arg == "--async-serialize") {
      asyncCheckpoint = true;
    }
  }

  /* --- Print information about the simulation */
  if(vt::theContext()->getNode() == 0){
    fmt::print(
      stdout, "\n - Solve the linear system for the Laplacian with homogeneous Dirichlet"
      " on [0, 1] x [0, 1] x [0, 1]\n"
    );
    fmt::print(" - Second-order centered finite difference\n");
    fmt::print(" - {} elements decomposed onto {} objects.\n", dataRange.toString(), colRange.toString());
    fmt::print(" - Maximum number of iterations {}\n", maxIter);
    fmt::print(" - Convergence tolerance {}\n", tolerance);
    fmt::print("\n");
  }
}
}

/// Parses the resilience-related command-line options, creates the
/// KokkosResilience context and selects the checkpoint filter.
///
/// Recognized options (any other argument is ignored):
///   --config FILE          sets config_filename
///   --mode VT|MPI          sets context_mode
///   --freq N               sets checkpoint_frequency
///   --kill ITER            sets kill_iter
///   --kill-rank RANK       sets kill_rank
///   --iters-per-phase N    sets iters_per_phase
///   --iters-per-epoch N    sets iters_per_epoch
///
/// checkpoint_frequency < 0 disables checkpointing, 0 defers to the context's
/// default filter, and a positive value checkpoints every N-th iteration.
/// Iteration intervals that end up below 1 are replaced by app_cfg.maxIter+1.
///
/// @param app_cfg  application configuration; only maxIter is read here.
/// @throws std::invalid_argument if an option is missing its value or if
///         --mode is neither "VT" nor "MPI". std::stoi additionally throws on
///         malformed or out-of-range numbers.
ResilienceConfig::ResilienceConfig(int argc, char** argv, Jacobi::Config app_cfg){
  int i = 0;

  // Fetches the next command-line token as the value of `option`, advancing
  // the cursor. Refuses to step past the end of argv instead of reading
  // argv[argc] (a null pointer).
  const auto next_value = [&](const std::string& option) -> std::string {
    if(i + 1 >= argc)
      throw std::invalid_argument("Missing value for option " + option);
    return argv[++i];
  };

  for(; i < argc; i++){
    const std::string arg = argv[i];
    if(arg == "--config")
      config_filename = next_value(arg);
    else if(arg == "--mode")
      context_mode = next_value(arg);
    else if(arg == "--freq")
      checkpoint_frequency = std::stoi(next_value(arg));
    else if(arg == "--kill")
      kill_iter = std::stoi(next_value(arg));
    else if(arg == "--kill-rank")
      kill_rank = std::stoi(next_value(arg));
    else if(arg == "--iters-per-phase")
      iters_per_phase = std::stoi(next_value(arg));
    else if(arg == "--iters-per-epoch")
      iters_per_epoch = std::stoi(next_value(arg));
  }


  if(context_mode == "VT") {
    if(iters_per_epoch == 0) iters_per_epoch = -1;
    context = kr::make_context(vt::theContext(), config_filename);
  } else if(context_mode == "MPI"){
    if(iters_per_epoch == 0){
      iters_per_epoch = checkpoint_frequency;
      //Can't infer both iters_per_epoch and checkpoint_frequency
      assert(checkpoint_frequency != 0);
    }
    context = kr::make_context(MPI_COMM_WORLD, config_filename);
  } else throw std::invalid_argument("Valid --mode values are VT or MPI");

  std::string freq_str;
  if(checkpoint_frequency < 0) {
    freq_str = "never";
    // Parameter left unnamed: it is intentionally ignored.
    checkpoint_filter = [](int){ return false; };
  } else if(checkpoint_frequency == 0){
    freq_str = "according to json";
    checkpoint_filter = context->default_filter();
  } else {
    freq_str = fmt::format("every {} iterations", checkpoint_frequency);
    checkpoint_filter = kr::Filter::NthIterationFilter(checkpoint_frequency);
  }


  // Remember which intervals were "disabled" before they are replaced by
  // maxIter+1; the report below would otherwise never be able to tell.
  const bool phases_disabled = iters_per_phase < 1;
  const bool epochs_unbounded = iters_per_epoch < 1;
  if(phases_disabled) iters_per_phase = app_cfg.maxIter+1;
  if(epochs_unbounded) iters_per_epoch = app_cfg.maxIter+1;


  if(vt::theContext()->getNode() == 0) {
    fmt::print("kr:: {} Context configured against {}\n", context_mode, config_filename);
    fmt::print("kr:: Checkpointing {}\n", freq_str);
    // Rank 0 is a valid failure target (try_kill compares against getNode()),
    // so only negative ranks are treated as "no failure requested".
    if(kill_iter > 0 && kill_rank >= 0){
      fmt::print("Generating failure at iteration {} on rank {}\n", kill_iter, kill_rank);
      if(kill_rank >= vt::theContext()->getNumNodes()){
        fmt::print("WARNING: kill_rank {} does not exist!\n", kill_rank);
      }
    }

    if(epochs_unbounded){
      fmt::print("kr:: instructing app not to bound iterations\n");
    } else {
      fmt::print("kr:: instructing app to bound every {} iterations\n", iters_per_epoch);
    }

    if(phases_disabled){
      fmt::print("kr:: instructing app not to use phases\n");
    } else {
      fmt::print("kr:: instructing app to phase every {} iterations\n", iters_per_phase);
    }
  }
}

/// Simulates a process failure: when `current_iteration` equals kill_iter and
/// this node is kill_rank, reports the event on stderr and terminates the
/// process with exit status 1. Otherwise does nothing.
void ResilienceConfig::try_kill(int current_iteration){
  const bool is_kill_iteration = (kill_iter == current_iteration);
  if(!is_kill_iteration) return;

  // Only the designated rank dies; every other rank keeps running.
  if(kill_rank != vt::theContext()->getNode()) return;

  fmt::print(stderr, "Rank {} simulating failure on iteration {}\n",
             kill_rank, kill_iter);
  exit(1);
}
Loading
Loading