diff --git a/src/apps/MASS3DEA-Hip.cpp b/src/apps/MASS3DEA-Hip.cpp index 2eeabadeb..de3b2d5ce 100644 --- a/src/apps/MASS3DEA-Hip.cpp +++ b/src/apps/MASS3DEA-Hip.cpp @@ -54,7 +54,7 @@ __global__ void Mass3DEA(const Real_ptr B, const Real_ptr D, Real_ptr M) { } } } - + } template < size_t block_size > @@ -65,6 +65,17 @@ void MASS3DEA::runHipVariantImpl(VariantID vid) { MASS3DEA_DATA_SETUP; + //Extra kernel launch ... + { + dim3 nthreads_per_block(MEA_D1D, MEA_D1D, MEA_D1D); + constexpr size_t shmem = 0; + + RPlaunchHipKernel( (Mass3DEA), + NE, nthreads_per_block, + shmem, res.get_stream(), + B, D, M ); + } + switch (vid) { case Base_HIP: { @@ -89,7 +100,7 @@ void MASS3DEA::runHipVariantImpl(VariantID vid) { constexpr bool async = true; - using launch_policy = RAJA::LaunchPolicy>; + using launch_policy = RAJA::LaunchPolicy>; using outer_x = RAJA::LoopPolicy; @@ -99,72 +110,71 @@ void MASS3DEA::runHipVariantImpl(VariantID vid) { using inner_z = RAJA::LoopPolicy>; + using inner_zyx = RAJA::LoopPolicy; + + using inner_yxz = RAJA::LoopPolicy; + startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { +#if 1 RAJA::launch( res, RAJA::LaunchParams(RAJA::Teams(NE), RAJA::Threads(MEA_D1D, MEA_D1D, MEA_D1D)), [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { RAJA::loop(ctx, RAJA::RangeSegment(0, NE), - [&](int e) { + [&](int e) { MASS3DEA_0 - RAJA::loop(ctx, RAJA::RangeSegment(0, 1), - [&](int ) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MEA_D1D), - [&](int d) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MEA_Q1D), - [&](int q) { - MASS3DEA_1 - } - ); // RAJA::loop - } - ); // RAJA::loop - } - ); // RAJA::loop - + RAJA::expt::loop + (ctx, RAJA::RangeSegment(0, MEA_Q1D), RAJA::RangeSegment(0, MEA_D1D), RAJA::RangeSegment(0, 1), + [&](int q, int d, int ) { + MASS3DEA_1 + } + ); // RAJA::loop MASS3DEA_2 - RAJA::loop(ctx, RAJA::RangeSegment(0, MEA_Q1D), - [&](int k1) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MEA_Q1D), - [&](int k2) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MEA_Q1D), - [&](int k3) { - MASS3DEA_3 - } - ); // RAJA::loop - } - ); // RAJA::loop - } - ); // RAJA::loop + RAJA::expt::loop + (ctx, RAJA::RangeSegment(0, MEA_Q1D), RAJA::RangeSegment(0, MEA_Q1D), RAJA::RangeSegment(0, MEA_Q1D), + [&](int k3, int k2, int k1) { + MASS3DEA_3 + } + ); // RAJA::loop + ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, MEA_D1D), - [&](int i1) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MEA_D1D), - [&](int i2) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MEA_D1D), - [&](int i3) { - MASS3DEA_4 - } - ); // RAJA::loop - } - ); // RAJA::loop - } - ); // RAJA::loop - - } // lambda (e) - ); // RAJA::loop + RAJA::expt::loop + (ctx, RAJA::RangeSegment(0, MEA_D1D), RAJA::RangeSegment(0, MEA_D1D), RAJA::RangeSegment(0, MEA_D1D), + [&](int i3, int i2, int i1) { + MASS3DEA_4 + } + ); // RAJA::loop + + } // lambda (e) + ); // RAJA::loop } // outer lambda (ctx) ); // RAJA::launch +#else + + dim3 nthreads_per_block(MEA_D1D, MEA_D1D, MEA_D1D); + constexpr size_t shmem = 0; + + RPlaunchHipKernel( (Mass3DEA), + NE, nthreads_per_block, + shmem, res.get_stream(), + B, D, M ); + + + +#endif + + } // loop over kernel reps stopTimer(); diff --git a/tpl/RAJA b/tpl/RAJA index 378199aac..c58f45739 160000 --- a/tpl/RAJA +++ b/tpl/RAJA @@ -1 +1 @@ -Subproject commit 378199aac342ee21c2ddfbcbb48413bd1dfac612 +Subproject commit c58f457397bf3c96af5e53dd69d8d55f5561c703