From b9e5af682262c3b3008c47892ca27be8386b06e3 Mon Sep 17 00:00:00 2001 From: Anders Johansson Date: Thu, 25 May 2023 22:07:49 -0400 Subject: [PATCH 01/11] Revert "Revert "get rid of single_bond_grad, attempted transpose (unhelpful), added off-diagonal setflag, need cleaning"" This reverts commit 700f2c86376a5f439483b55b7b0b2129fd0ca48e. --- lammps_plugins/kokkos/pair_flare_kokkos.cpp | 147 +++++++++++++++----- lammps_plugins/kokkos/pair_flare_kokkos.h | 5 + lammps_plugins/pair_flare.cpp | 4 +- 3 files changed, 121 insertions(+), 35 deletions(-) diff --git a/lammps_plugins/kokkos/pair_flare_kokkos.cpp b/lammps_plugins/kokkos/pair_flare_kokkos.cpp index 76653df28..0e4d31ee7 100644 --- a/lammps_plugins/kokkos/pair_flare_kokkos.cpp +++ b/lammps_plugins/kokkos/pair_flare_kokkos.cpp @@ -193,9 +193,9 @@ void PairFLAREKokkos::compute(int eflag_in, int vflag_in) + 2 // evdwls, B2_norm2s + 0.5 // numneigh_short + max_neighs * ( - n_max*4 // g - + n_harmonics*4 // Y - + n_max*n_harmonics*3 // single_bond_grad + n_max*4 // g and gT + + n_harmonics*4 // Y and YT + //+ n_max*n_harmonics*3 // single_bond_grad + 3 // partial_forces + 0.5 // neighs_short ) @@ -258,8 +258,12 @@ void PairFLAREKokkos::compute(int eflag_in, int vflag_in) g_ra = g; Y_ra = Y; - single_bond_grad = View5D(); - single_bond_grad = View5D(Kokkos::ViewAllocateWithoutInitializing("FLARE: single_bond_grad"), batch_size, max_neighs, 3, n_max, n_harmonics); + //gT = View4D(); YT = View4D(); + //gT = View4D(Kokkos::ViewAllocateWithoutInitializing("FLARE: gT"), batch_size, max_neighs, 4, n_max); + //YT = View4D(Kokkos::ViewAllocateWithoutInitializing("FLARE: YT"), batch_size, max_neighs, 4, n_harmonics); + + // single_bond_grad = View5D(); + // single_bond_grad = View5D(Kokkos::ViewAllocateWithoutInitializing("FLARE: single_bond_grad"), batch_size, max_neighs, 3, n_max, n_harmonics); partial_forces = View3D(); partial_forces = View3D(Kokkos::ViewAllocateWithoutInitializing("FLARE: partial forces"), batch_size, max_neighs, 3); @@ -277,14 +281,25 @@ void PairFLAREKokkos::compute(int eflag_in, int vflag_in) *this ); + // transpose R and Y for later use + int g_size = ScratchView3D::shmem_size(n_max, 4, max_neighs); + int Y_size = ScratchView3D::shmem_size(n_harmonics, 4, max_neighs); + /* + auto transpolicy = Kokkos::TeamPolicy(batch_size, SINGLE_BOND_TEAM_SIZE, vector_length).set_scratch_size( + 0, Kokkos::PerTeam(g_size + Y_size)); + Kokkos::parallel_for("FLARE: transpose R and Y", + transpolicy, + *this + ); + */ + // compute single bond and its gradient // dnlm, dnlmj - int g_size = ScratchView2D::shmem_size(n_max, 4); - int Y_size = ScratchView2D::shmem_size(n_harmonics, 4); + g_size = ScratchView1D::shmem_size(n_max); + Y_size = ScratchView1D::shmem_size(n_harmonics); auto policy = Kokkos::TeamPolicy(batch_size, SINGLE_BOND_TEAM_SIZE, vector_length).set_scratch_size( 0, Kokkos::PerThread(g_size + Y_size)); Kokkos::deep_copy(single_bond, 0.0); - //Kokkos::deep_copy(single_bond_grad, 0.0); Kokkos::parallel_for("FLARE: single bond", policy, *this @@ -328,11 +343,15 @@ void PairFLAREKokkos::compute(int eflag_in, int vflag_in) ); // compute partial forces + g_size = ScratchView2D::shmem_size(4, n_max); + Y_size = ScratchView2D::shmem_size(4, n_harmonics); int u_size = ScratchView2D::shmem_size(n_radial, n_harmonics); Kokkos::parallel_for("FLARE: partial forces", Kokkos::TeamPolicy(batch_size, TEAM_SIZE, vector_length).set_scratch_size( 0, Kokkos::PerTeam(u_size) - ), + )/*.set_scratch_size( + 0, Kokkos::PerThread(g_size + Y_size) + )*/, *this ); @@ -411,34 +430,84 @@ void PairFLAREKokkos::operator()(const int ii, const int jj) const { template KOKKOS_INLINE_FUNCTION -void PairFLAREKokkos::operator()(TagSingleBond, const MemberType team_member) const{ +void PairFLAREKokkos::operator()(TagTransposeRY, const MemberType team_member) const{ int ii = team_member.league_rank(); const int jnum = d_numneigh_short(ii); - ScratchView2D gscratch(team_member.thread_scratch(0), 4, n_max); - ScratchView2D Yscratch(team_member.thread_scratch(0), 4, n_harmonics); + ScratchView3D gscratch(team_member.team_scratch(0), 4, n_max, max_neighs); + ScratchView3D Yscratch(team_member.team_scratch(0), 4, n_harmonics, max_neighs); - Kokkos::parallel_for(Kokkos::TeamThreadRange(team_member, jnum), [&] (int jj){ - int j = d_neighbors_short(ii,jj); - j &= NEIGHMASK; - int s = type[j] - 1; + Kokkos::parallel_for(Kokkos::TeamThreadRange(team_member, 4*n_max), [&] (int nc){ + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team_member, jnum), [&] (int jj){ + //int n = nc / 4; + //int c = nc -4*n; + int c = nc / n_max; + int n = nc - c*n_max; + gscratch(c, n, jj) = g(ii, jj, n, c); + }); + }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team_member, 4*n_harmonics), [&] (int lmc){ + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team_member, jnum), [&] (int jj){ + //int lm = lmc / 4; + //int c = lmc - 4 * lm; + int c = lmc / n_harmonics; + int lm = lmc - c*n_harmonics; + Yscratch(c, lm, jj) = Y(ii, jj, lm, c); + }); + }); + team_member.team_barrier(); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team_member, jnum), [&] (int jj){ Kokkos::parallel_for(Kokkos::ThreadVectorRange(team_member, 4*n_max), [&] (int nc){ //int n = nc / 4; //int c = nc -4*n; int c = nc / n_max; int n = nc - c*n_max; - gscratch(c, n) = g_ra(ii, jj, n, c); + gT(ii, jj, c, n) = gscratch(c, n, jj); }); Kokkos::parallel_for(Kokkos::ThreadVectorRange(team_member, 4*n_harmonics), [&] (int lmc){ //int lm = lmc / 4; //int c = lmc - 4 * lm; int c = lmc / n_harmonics; int lm = lmc - c*n_harmonics; - Yscratch(c, lm) = Y_ra(ii, jj, lm, c); + YT(ii, jj, c, lm) = Yscratch(c, lm, jj); + }); + }); +} + +template +KOKKOS_INLINE_FUNCTION +void PairFLAREKokkos::operator()(TagSingleBond, const MemberType team_member) const{ + int ii = team_member.league_rank(); + + const int jnum = d_numneigh_short(ii); + + ScratchView1D gscratch(team_member.thread_scratch(0), n_max); + ScratchView1D Yscratch(team_member.thread_scratch(0), n_harmonics); + + Kokkos::parallel_for(Kokkos::TeamThreadRange(team_member, jnum), [&] (int jj){ + + int j = d_neighbors_short(ii,jj); + j &= NEIGHMASK; + int s = type[j] - 1; + + + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team_member, n_max), [&] (int n){ + //int n = nc / 4; + //int c = nc -4*n; + // int c = nc / n_max; + // int n = nc - c*n_max; + gscratch(n) = g(ii, jj, n, 0); + }); + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team_member, n_harmonics), [&] (int lm){ + //int lm = lmc / 4; + //int c = lmc - 4 * lm; + // int c = lmc / n_harmonics; + // int lm = lmc - c*n_harmonics; + Yscratch(lm) = Y(ii, jj, lm, 0); }); Kokkos::parallel_for(Kokkos::ThreadVectorRange(team_member, n_max*n_harmonics), [&] (int nlm){ @@ -446,28 +515,28 @@ void PairFLAREKokkos::operator()(TagSingleBond, const MemberType tea int lm = nlm - n_harmonics*n; int radial_index = s*n_max + n; - double g_val = gscratch(0,n); - double gx_val = gscratch(1,n); - double gy_val = gscratch(2,n); - double gz_val = gscratch(3,n); + double g_val = gscratch(n); + // double gx_val = gscratch(1,n); + // double gy_val = gscratch(2,n); + // double gz_val = gscratch(3,n); - double h_val = Yscratch(0,lm); - double hx_val = Yscratch(1,lm); - double hy_val = Yscratch(2,lm); - double hz_val = Yscratch(3,lm); + double h_val = Yscratch(lm); + // double hx_val = Yscratch(1,lm); + // double hy_val = Yscratch(2,lm); + // double hz_val = Yscratch(3,lm); double bond = g_val * h_val; - double bond_x = gx_val * h_val + g_val * hx_val; - double bond_y = gy_val * h_val + g_val * hy_val; - double bond_z = gz_val * h_val + g_val * hz_val; + // double bond_x = gx_val * h_val + g_val * hx_val; + // double bond_y = gy_val * h_val + g_val * hy_val; + // double bond_z = gz_val * h_val + g_val * hz_val; // Update single bond basis arrays. Kokkos::atomic_add(&single_bond(ii, radial_index, lm),bond); // TODO: bad? - single_bond_grad(ii,jj,0,n,lm) = bond_x; - single_bond_grad(ii,jj,1,n,lm) = bond_y; - single_bond_grad(ii,jj,2,n,lm) = bond_z; + // single_bond_grad(ii,jj,0,n,lm) = bond_x; + // single_bond_grad(ii,jj,1,n,lm) = bond_y; + // single_bond_grad(ii,jj,2,n,lm) = bond_z; }); }); } @@ -590,6 +659,9 @@ void PairFLAREKokkos::operator()(TagF, const MemberType team_member) const int i = ilist_curr_type[ii+startatom]; const int jnum = d_numneigh_short(ii); + //ScratchView2D gscratch(team_member.thread_scratch(0), 4, n_max); + //ScratchView2D Yscratch(team_member.thread_scratch(0), 4, n_harmonics); + ScratchView2D uscratch(team_member.team_scratch(0), n_radial, n_harmonics); Kokkos::parallel_for(Kokkos::TeamVectorRange(team_member, n_bond), [&] (int nlm){ int n = nlm / n_harmonics; @@ -611,7 +683,16 @@ void PairFLAREKokkos::operator()(TagF, const MemberType team_member) int n = nlm / n_harmonics; int lm = nlm - n*n_harmonics; int radial_index = s*n_max + n; - tmp += single_bond_grad(ii, jj, c, n, lm)*uscratch(radial_index, lm); + + double gval = g(ii, jj, n, 0); + double gg = g(ii, jj, n, c+1); + + double Yval = Y(ii, jj, lm, 0); + double Yg = Y(ii, jj, lm, c+1); + + tmp += (gg*Yval + gval*Yg) * uscratch(radial_index, lm); + + // tmp += single_bond_grad(ii, jj, c, n, lm)*uscratch(radial_index, lm); }, tmp); partial_forces(ii,jj,c) = tmp; }); diff --git a/lammps_plugins/kokkos/pair_flare_kokkos.h b/lammps_plugins/kokkos/pair_flare_kokkos.h index ee8853057..32cc2403c 100644 --- a/lammps_plugins/kokkos/pair_flare_kokkos.h +++ b/lammps_plugins/kokkos/pair_flare_kokkos.h @@ -35,6 +35,7 @@ struct Tagw{}; struct Tagu{}; struct TagF{}; struct TagStoreF{}; +struct TagTransposeRY{}; namespace LAMMPS_NS { @@ -57,6 +58,9 @@ class PairFLAREKokkos : public PairFLARE { KOKKOS_INLINE_FUNCTION void operator()(TagFindCurrType, const int) const; + KOKKOS_INLINE_FUNCTION + void operator()(TagTransposeRY, const MemberType) const; + KOKKOS_INLINE_FUNCTION void operator()(TagSingleBond, const MemberType) const; @@ -136,6 +140,7 @@ class PairFLAREKokkos : public PairFLARE { View3D beta, single_bond, u, partial_forces; gYView4D g, Y; gYView4DRA g_ra, Y_ra; + View4D gT, YT; View5D single_bond_grad; int B2_chunk_size; diff --git a/lammps_plugins/pair_flare.cpp b/lammps_plugins/pair_flare.cpp index 387c87800..e1b13352b 100644 --- a/lammps_plugins/pair_flare.cpp +++ b/lammps_plugins/pair_flare.cpp @@ -195,7 +195,8 @@ void PairFLARE::allocate() { // Set the diagonal of setflag to 1 (otherwise pair.cpp will throw an error) for (int i = 1; i <= n; i++) - setflag[i][i] = 1; + for(int j = 1; j <= n; j++) + setflag[i][j] = 1; } /* ---------------------------------------------------------------------- @@ -210,7 +211,6 @@ void PairFLARE::settings(int narg, char ** /*arg*/) { /* ---------------------------------------------------------------------- set coeffs for one or more type pairs - read DYNAMO funcfl file ------------------------------------------------------------------------- */ void PairFLARE::coeff(int narg, char **arg) { From db64484dba3f51d70df9aa91c72abda192332ffa Mon Sep 17 00:00:00 2001 From: Anders Johansson Date: Fri, 26 May 2023 12:42:17 -0700 Subject: [PATCH 02/11] add shared mem to partial_force --- lammps_plugins/kokkos/pair_flare_kokkos.cpp | 82 +++++++++++++++------ 1 file changed, 58 insertions(+), 24 deletions(-) diff --git a/lammps_plugins/kokkos/pair_flare_kokkos.cpp b/lammps_plugins/kokkos/pair_flare_kokkos.cpp index 0e4d31ee7..19d5cca37 100644 --- a/lammps_plugins/kokkos/pair_flare_kokkos.cpp +++ b/lammps_plugins/kokkos/pair_flare_kokkos.cpp @@ -349,9 +349,9 @@ void PairFLAREKokkos::compute(int eflag_in, int vflag_in) Kokkos::parallel_for("FLARE: partial forces", Kokkos::TeamPolicy(batch_size, TEAM_SIZE, vector_length).set_scratch_size( 0, Kokkos::PerTeam(u_size) - )/*.set_scratch_size( + ).set_scratch_size( 0, Kokkos::PerThread(g_size + Y_size) - )*/, + ), *this ); @@ -440,20 +440,20 @@ void PairFLAREKokkos::operator()(TagTransposeRY, const MemberType te Kokkos::parallel_for(Kokkos::TeamThreadRange(team_member, 4*n_max), [&] (int nc){ + int c = nc / n_max; + int n = nc - c*n_max; Kokkos::parallel_for(Kokkos::ThreadVectorRange(team_member, jnum), [&] (int jj){ //int n = nc / 4; //int c = nc -4*n; - int c = nc / n_max; - int n = nc - c*n_max; gscratch(c, n, jj) = g(ii, jj, n, c); }); }); Kokkos::parallel_for(Kokkos::TeamThreadRange(team_member, 4*n_harmonics), [&] (int lmc){ + int c = lmc / n_harmonics; + int lm = lmc - c*n_harmonics; Kokkos::parallel_for(Kokkos::ThreadVectorRange(team_member, jnum), [&] (int jj){ //int lm = lmc / 4; //int c = lmc - 4 * lm; - int c = lmc / n_harmonics; - int lm = lmc - c*n_harmonics; Yscratch(c, lm, jj) = Y(ii, jj, lm, c); }); }); @@ -500,6 +500,7 @@ void PairFLAREKokkos::operator()(TagSingleBond, const MemberType tea //int c = nc -4*n; // int c = nc / n_max; // int n = nc - c*n_max; + //gscratch(n) = gT(ii, jj, 0, n); gscratch(n) = g(ii, jj, n, 0); }); Kokkos::parallel_for(Kokkos::ThreadVectorRange(team_member, n_harmonics), [&] (int lm){ @@ -507,6 +508,7 @@ void PairFLAREKokkos::operator()(TagSingleBond, const MemberType tea //int c = lmc - 4 * lm; // int c = lmc / n_harmonics; // int lm = lmc - c*n_harmonics; + //Yscratch(lm) = YT(ii, jj, 0, lm); Yscratch(lm) = Y(ii, jj, lm, 0); }); @@ -659,8 +661,8 @@ void PairFLAREKokkos::operator()(TagF, const MemberType team_member) const int i = ilist_curr_type[ii+startatom]; const int jnum = d_numneigh_short(ii); - //ScratchView2D gscratch(team_member.thread_scratch(0), 4, n_max); - //ScratchView2D Yscratch(team_member.thread_scratch(0), 4, n_harmonics); + ScratchView2D gscratch(team_member.thread_scratch(0), 4, n_max); + ScratchView2D Yscratch(team_member.thread_scratch(0), 4, n_harmonics); ScratchView2D uscratch(team_member.team_scratch(0), n_radial, n_harmonics); Kokkos::parallel_for(Kokkos::TeamVectorRange(team_member, n_bond), [&] (int nlm){ @@ -670,31 +672,63 @@ void PairFLAREKokkos::operator()(TagF, const MemberType team_member) }); team_member.team_barrier(); - Kokkos::parallel_for(Kokkos::TeamThreadRange(team_member, 3*jnum), [&] (int &k){ - int jj = k/3; - int c = k - 3*jj; + Kokkos::parallel_for(Kokkos::TeamThreadRange(team_member, jnum), [&] (int &k){ + int jj = k; + //int jj = k/3; + //int c = k - 3*jj; int j = d_neighbors_short(ii,jj); j &= NEIGHMASK; int s = type[j] - 1; - F_FLOAT tmp = 0.0; - Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team_member, n_max*n_harmonics), [&](int nlm, F_FLOAT &tmp){ - int n = nlm / n_harmonics; - int lm = nlm - n*n_harmonics; - int radial_index = s*n_max + n; + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team_member, 4*n_max), [&] (int nc){ + //int n = nc / 4; + //int c = nc -4*n; + int c = nc / n_max; + int n = nc - c*n_max; + gscratch(c, n) = g(ii, jj, n, c); + //gscratch(c, n) = gT(ii, jj, c, n); + }); + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team_member, 4*n_harmonics), [&] (int lmc){ + //int lm = lmc / 4; + //int c = lmc - 4 * lm; + int c = lmc / n_harmonics; + int lm = lmc - c*n_harmonics; + Yscratch(c, lm) = Y(ii, jj, lm, c); + //Yscratch(c, lm) = YT(ii, jj, c, lm); + }); + + for (int c = 0; c < 3; c++) { + F_FLOAT tmp = 0.0; + Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team_member, n_max*n_harmonics), [&](int nlm, F_FLOAT &tmp){ + int n = nlm / n_harmonics; + int lm = nlm - n*n_harmonics; + int radial_index = s*n_max + n; - double gval = g(ii, jj, n, 0); - double gg = g(ii, jj, n, c+1); + double gval = gscratch(0, n); + double gg = gscratch(c+1, n); - double Yval = Y(ii, jj, lm, 0); - double Yg = Y(ii, jj, lm, c+1); + double Yval = Yscratch(0, lm); + double Yg = Yscratch(c+1, lm); - tmp += (gg*Yval + gval*Yg) * uscratch(radial_index, lm); + // double gval = gT(ii, jj, 0, n); + // double gg = gT(ii, jj, c+1, n); - // tmp += single_bond_grad(ii, jj, c, n, lm)*uscratch(radial_index, lm); - }, tmp); - partial_forces(ii,jj,c) = tmp; + // double Yval = YT(ii, jj, 0, lm); + // double Yg = YT(ii, jj, c+1, lm); + + // double gval = g(ii, jj, n, 0); + // double gg = g(ii, jj, n, c+1); + + // double Yval = Y(ii, jj, lm, 0); + // double Yg = Y(ii, jj, lm, c+1); + + tmp += (gg*Yval + gval*Yg) * uscratch(radial_index, lm); + + // tmp += single_bond_grad(ii, jj, c, n, lm)*uscratch(radial_index, lm); + }, tmp); + partial_forces(ii,jj,c) = tmp; + } }); } From ec38449d27176d93b1359e9351009b592382d584 Mon Sep 17 00:00:00 2001 From: Anders Johansson Date: Fri, 26 May 2023 13:59:19 -0700 Subject: [PATCH 03/11] cleanup, remove transposet and betaB2 --- lammps_plugins/kokkos/pair_flare_kokkos.cpp | 204 ++------------------ lammps_plugins/kokkos/pair_flare_kokkos.h | 6 - 2 files changed, 14 insertions(+), 196 deletions(-) diff --git a/lammps_plugins/kokkos/pair_flare_kokkos.cpp b/lammps_plugins/kokkos/pair_flare_kokkos.cpp index 19d5cca37..e8049b1f3 100644 --- a/lammps_plugins/kokkos/pair_flare_kokkos.cpp +++ b/lammps_plugins/kokkos/pair_flare_kokkos.cpp @@ -193,9 +193,8 @@ void PairFLAREKokkos::compute(int eflag_in, int vflag_in) + 2 // evdwls, B2_norm2s + 0.5 // numneigh_short + max_neighs * ( - n_max*4 // g and gT - + n_harmonics*4 // Y and YT - //+ n_max*n_harmonics*3 // single_bond_grad + n_max*4 // g + + n_harmonics*4 // Y + 3 // partial_forces + 0.5 // neighs_short ) @@ -255,15 +254,7 @@ void PairFLAREKokkos::compute(int eflag_in, int vflag_in) g = gYView4D(); Y = gYView4D(); g = gYView4D(Kokkos::ViewAllocateWithoutInitializing("FLARE: g"), glayout); Y = gYView4D(Kokkos::ViewAllocateWithoutInitializing("FLARE: Y"), Ylayout); - g_ra = g; - Y_ra = Y; - //gT = View4D(); YT = View4D(); - //gT = View4D(Kokkos::ViewAllocateWithoutInitializing("FLARE: gT"), batch_size, max_neighs, 4, n_max); - //YT = View4D(Kokkos::ViewAllocateWithoutInitializing("FLARE: YT"), batch_size, max_neighs, 4, n_harmonics); - - // single_bond_grad = View5D(); - // single_bond_grad = View5D(Kokkos::ViewAllocateWithoutInitializing("FLARE: single_bond_grad"), batch_size, max_neighs, 3, n_max, n_harmonics); partial_forces = View3D(); partial_forces = View3D(Kokkos::ViewAllocateWithoutInitializing("FLARE: partial forces"), batch_size, max_neighs, 3); @@ -281,22 +272,9 @@ void PairFLAREKokkos::compute(int eflag_in, int vflag_in) *this ); - // transpose R and Y for later use - int g_size = ScratchView3D::shmem_size(n_max, 4, max_neighs); - int Y_size = ScratchView3D::shmem_size(n_harmonics, 4, max_neighs); - /* - auto transpolicy = Kokkos::TeamPolicy(batch_size, SINGLE_BOND_TEAM_SIZE, vector_length).set_scratch_size( - 0, Kokkos::PerTeam(g_size + Y_size)); - Kokkos::parallel_for("FLARE: transpose R and Y", - transpolicy, - *this - ); - */ - - // compute single bond and its gradient - // dnlm, dnlmj - g_size = ScratchView1D::shmem_size(n_max); - Y_size = ScratchView1D::shmem_size(n_harmonics); + // compute single bond dnlm = RjnYjlm + int g_size = ScratchView1D::shmem_size(n_max); + int Y_size = ScratchView1D::shmem_size(n_harmonics); auto policy = Kokkos::TeamPolicy(batch_size, SINGLE_BOND_TEAM_SIZE, vector_length).set_scratch_size( 0, Kokkos::PerThread(g_size + Y_size)); Kokkos::deep_copy(single_bond, 0.0); @@ -313,20 +291,8 @@ void PairFLAREKokkos::compute(int eflag_in, int vflag_in) *this ); - // compute beta*B2 - if(n_species>0){ - KokkosBlas::gemm("N", "T", 1.0, B2, Kokkos::subview(beta, curr_type, Kokkos::ALL(), Kokkos::ALL()), 0.0, beta_B2); - } - else{ - B2_chunk_size = std::min(1000, n_descriptors); - int B2_size = ScratchView1D::shmem_size(B2_chunk_size); - Kokkos::parallel_for("FLARE: beta*B2", - Kokkos::TeamPolicy(batch_size, TEAM_SIZE, vector_length).set_scratch_size( - 0, Kokkos::PerTeam(B2_size) - ), - *this - ); - } + // compute beta*B2 + KokkosBlas::gemm("N", "T", 1.0, B2, Kokkos::subview(beta, curr_type, Kokkos::ALL(), Kokkos::ALL()), 0.0, beta_B2); // compute B2 squared norms and evdwls and w Kokkos::parallel_for("FLARE: B2 norm2 evdwl w", @@ -428,56 +394,6 @@ void PairFLAREKokkos::operator()(const int ii, const int jj) const { */ } -template -KOKKOS_INLINE_FUNCTION -void PairFLAREKokkos::operator()(TagTransposeRY, const MemberType team_member) const{ - int ii = team_member.league_rank(); - - const int jnum = d_numneigh_short(ii); - - ScratchView3D gscratch(team_member.team_scratch(0), 4, n_max, max_neighs); - ScratchView3D Yscratch(team_member.team_scratch(0), 4, n_harmonics, max_neighs); - - - Kokkos::parallel_for(Kokkos::TeamThreadRange(team_member, 4*n_max), [&] (int nc){ - int c = nc / n_max; - int n = nc - c*n_max; - Kokkos::parallel_for(Kokkos::ThreadVectorRange(team_member, jnum), [&] (int jj){ - //int n = nc / 4; - //int c = nc -4*n; - gscratch(c, n, jj) = g(ii, jj, n, c); - }); - }); - Kokkos::parallel_for(Kokkos::TeamThreadRange(team_member, 4*n_harmonics), [&] (int lmc){ - int c = lmc / n_harmonics; - int lm = lmc - c*n_harmonics; - Kokkos::parallel_for(Kokkos::ThreadVectorRange(team_member, jnum), [&] (int jj){ - //int lm = lmc / 4; - //int c = lmc - 4 * lm; - Yscratch(c, lm, jj) = Y(ii, jj, lm, c); - }); - }); - team_member.team_barrier(); - - Kokkos::parallel_for(Kokkos::TeamThreadRange(team_member, jnum), [&] (int jj){ - - Kokkos::parallel_for(Kokkos::ThreadVectorRange(team_member, 4*n_max), [&] (int nc){ - //int n = nc / 4; - //int c = nc -4*n; - int c = nc / n_max; - int n = nc - c*n_max; - gT(ii, jj, c, n) = gscratch(c, n, jj); - }); - Kokkos::parallel_for(Kokkos::ThreadVectorRange(team_member, 4*n_harmonics), [&] (int lmc){ - //int lm = lmc / 4; - //int c = lmc - 4 * lm; - int c = lmc / n_harmonics; - int lm = lmc - c*n_harmonics; - YT(ii, jj, c, lm) = Yscratch(c, lm, jj); - }); - }); -} - template KOKKOS_INLINE_FUNCTION void PairFLAREKokkos::operator()(TagSingleBond, const MemberType team_member) const{ @@ -494,21 +410,10 @@ void PairFLAREKokkos::operator()(TagSingleBond, const MemberType tea j &= NEIGHMASK; int s = type[j] - 1; - Kokkos::parallel_for(Kokkos::ThreadVectorRange(team_member, n_max), [&] (int n){ - //int n = nc / 4; - //int c = nc -4*n; - // int c = nc / n_max; - // int n = nc - c*n_max; - //gscratch(n) = gT(ii, jj, 0, n); gscratch(n) = g(ii, jj, n, 0); }); Kokkos::parallel_for(Kokkos::ThreadVectorRange(team_member, n_harmonics), [&] (int lm){ - //int lm = lmc / 4; - //int c = lmc - 4 * lm; - // int c = lmc / n_harmonics; - // int lm = lmc - c*n_harmonics; - //Yscratch(lm) = YT(ii, jj, 0, lm); Yscratch(lm) = Y(ii, jj, lm, 0); }); @@ -518,27 +423,14 @@ void PairFLAREKokkos::operator()(TagSingleBond, const MemberType tea int radial_index = s*n_max + n; double g_val = gscratch(n); - // double gx_val = gscratch(1,n); - // double gy_val = gscratch(2,n); - // double gz_val = gscratch(3,n); - - double h_val = Yscratch(lm); - // double hx_val = Yscratch(1,lm); - // double hy_val = Yscratch(2,lm); - // double hz_val = Yscratch(3,lm); + double Y_val = Yscratch(lm); - double bond = g_val * h_val; - // double bond_x = gx_val * h_val + g_val * hx_val; - // double bond_y = gy_val * h_val + g_val * hy_val; - // double bond_z = gz_val * h_val + g_val * hz_val; + double bond = g_val * Y_val; // Update single bond basis arrays. - Kokkos::atomic_add(&single_bond(ii, radial_index, lm),bond); // TODO: bad? + Kokkos::atomic_add(&single_bond(ii, radial_index, lm),bond); // TODO: reorder loops? - // single_bond_grad(ii,jj,0,n,lm) = bond_x; - // single_bond_grad(ii,jj,1,n,lm) = bond_y; - // single_bond_grad(ii,jj,2,n,lm) = bond_z; }); }); } @@ -560,52 +452,10 @@ void PairFLAREKokkos::operator()(TagB2, const int ii, const int nnl) B2(ii, nnl) = tmp; } -template -KOKKOS_INLINE_FUNCTION -void PairFLAREKokkos::operator()(TagBetaB2, const MemberType team_member) const{ - int ii = team_member.league_rank(); - const int i = ilist_curr_type[ii+startatom]; - - const int itype = type[i] - 1; - - ScratchView1D B2scratch(team_member.team_scratch(0), B2_chunk_size); - - Kokkos::parallel_for(Kokkos::TeamVectorRange(team_member, n_descriptors), [&] (int nnl){ - beta_B2(ii,nnl) = 0.0; - }); - - // do mat-vec product in chunks to enable level 0 scratch - // even when descriptors are too big - for(int starti = 0; starti < n_descriptors; starti += B2_chunk_size){ - int stopi = starti + B2_chunk_size; - stopi = n_descriptors < stopi ? n_descriptors : stopi; -// Kokkos::single(Kokkos::PerTeam(team_member), [&] () { -// if(ii==0) printf("%d %d %d\n", n_descriptors, starti, stopi); -// }); - Kokkos::parallel_for(Kokkos::TeamVectorRange(team_member, stopi - starti), [&] (int nnl){ - B2scratch(nnl) = B2(ii, nnl + starti); - }); - team_member.team_barrier(); - - // TODO: team-wise GEMV? - Kokkos::parallel_for(Kokkos::TeamThreadRange(team_member, n_descriptors), [&] (int x){ - F_FLOAT tmp = 0.0; - Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team_member, stopi - starti), [&](int y, F_FLOAT &tmp){ - tmp += beta(itype, x, y+starti)*B2scratch(y); - }, tmp); - Kokkos::single(Kokkos::PerThread(team_member), [&] () { - beta_B2(ii, x) += tmp; - }); - }); - team_member.team_barrier(); - } -} - template KOKKOS_INLINE_FUNCTION void PairFLAREKokkos::operator()(TagNorm2, const MemberType team_member) const{ int ii = team_member.league_rank(); - double empty_thresh = 1e-8; F_FLOAT tmp = 0.0; Kokkos::parallel_reduce(Kokkos::TeamVectorRange(team_member, n_descriptors), [&] (int x, F_FLOAT &tmp){ @@ -640,7 +490,6 @@ template KOKKOS_INLINE_FUNCTION void PairFLAREKokkos::operator()(Tagu, const int ii, const int n1, const int lm) const{ int l = sqrt(1.0*lm); - //int l = Kokkos::Experimental::sqrt(lm); F_FLOAT un1lm = 0.0; for(int n2 = 0; n2 < n_radial; n2++){ @@ -674,28 +523,21 @@ void PairFLAREKokkos::operator()(TagF, const MemberType team_member) Kokkos::parallel_for(Kokkos::TeamThreadRange(team_member, jnum), [&] (int &k){ int jj = k; - //int jj = k/3; - //int c = k - 3*jj; int j = d_neighbors_short(ii,jj); j &= NEIGHMASK; int s = type[j] - 1; Kokkos::parallel_for(Kokkos::ThreadVectorRange(team_member, 4*n_max), [&] (int nc){ - //int n = nc / 4; - //int c = nc -4*n; int c = nc / n_max; int n = nc - c*n_max; gscratch(c, n) = g(ii, jj, n, c); - //gscratch(c, n) = gT(ii, jj, c, n); + }); Kokkos::parallel_for(Kokkos::ThreadVectorRange(team_member, 4*n_harmonics), [&] (int lmc){ - //int lm = lmc / 4; - //int c = lmc - 4 * lm; int c = lmc / n_harmonics; int lm = lmc - c*n_harmonics; Yscratch(c, lm) = Y(ii, jj, lm, c); - //Yscratch(c, lm) = YT(ii, jj, c, lm); }); for (int c = 0; c < 3; c++) { @@ -711,21 +553,8 @@ void PairFLAREKokkos::operator()(TagF, const MemberType team_member) double Yval = Yscratch(0, lm); double Yg = Yscratch(c+1, lm); - // double gval = gT(ii, jj, 0, n); - // double gg = gT(ii, jj, c+1, n); - - // double Yval = YT(ii, jj, 0, lm); - // double Yg = YT(ii, jj, c+1, lm); - - // double gval = g(ii, jj, n, 0); - // double gg = g(ii, jj, n, c+1); - - // double Yval = Y(ii, jj, lm, 0); - // double Yg = Y(ii, jj, lm, c+1); - tmp += (gg*Yval + gval*Yg) * uscratch(radial_index, lm); - // tmp += single_bond_grad(ii, jj, c, n, lm)*uscratch(radial_index, lm); }, tmp); partial_forces(ii,jj,c) = tmp; } @@ -822,13 +651,8 @@ void PairFLAREKokkos::operator()(const int& ii) const { } -/* ---------------------------------------------------------------------- */ - - - - /* ---------------------------------------------------------------------- - set coeffs for one or more type pairs + read coeff file with settings and beta matrix, copy to device ------------------------------------------------------------------------- */ template @@ -888,12 +712,12 @@ void PairFLAREKokkos::init_style() // always request a full neighbor list - if (neighflag != FULL) { // TODO: figure this out + if (neighflag != FULL) { error->all(FLERR,"Cannot use chosen neighbor list style with pair flare/kk"); } // get available memory from environment variable, - // defaults to 16 GB set in the header file + // defaults to 12 GB set in the header file char *memstr = std::getenv("MAXMEM"); if (memstr != NULL) { maxmem = std::atof(memstr) * 1.0e9; diff --git a/lammps_plugins/kokkos/pair_flare_kokkos.h b/lammps_plugins/kokkos/pair_flare_kokkos.h index 32cc2403c..752106c5f 100644 --- a/lammps_plugins/kokkos/pair_flare_kokkos.h +++ b/lammps_plugins/kokkos/pair_flare_kokkos.h @@ -58,9 +58,6 @@ class PairFLAREKokkos : public PairFLARE { KOKKOS_INLINE_FUNCTION void operator()(TagFindCurrType, const int) const; - KOKKOS_INLINE_FUNCTION - void operator()(TagTransposeRY, const MemberType) const; - KOKKOS_INLINE_FUNCTION void operator()(TagSingleBond, const MemberType) const; @@ -139,9 +136,6 @@ class PairFLAREKokkos : public PairFLARE { View2D B2, beta_B2, w, cutoff_matrix_k; View3D beta, single_bond, u, partial_forces; gYView4D g, Y; - gYView4DRA g_ra, Y_ra; - View4D gT, YT; - View5D single_bond_grad; int B2_chunk_size; From 02044188bf8348ccfa5796a64b4ac11427b0a4e4 Mon Sep 17 00:00:00 2001 From: Anders Johansson Date: Thu, 2 May 2024 17:09:37 -0700 Subject: [PATCH 04/11] reduce RY launch bounds to maxshortneigh (and compute this), combined cartesian directions for Fij, combined dot products for B2B2 B2betaB2 --- lammps_plugins/kokkos/pair_flare_kokkos.cpp | 138 +++++++++++++++----- 1 file changed, 102 insertions(+), 36 deletions(-) diff --git a/lammps_plugins/kokkos/pair_flare_kokkos.cpp b/lammps_plugins/kokkos/pair_flare_kokkos.cpp index e8049b1f3..41089b18a 100644 --- a/lammps_plugins/kokkos/pair_flare_kokkos.cpp +++ b/lammps_plugins/kokkos/pair_flare_kokkos.cpp @@ -30,6 +30,7 @@ #include "atom_masks.h" #include "math_const.h" #include +#include #include #include @@ -241,34 +242,81 @@ void PairFLAREKokkos::compute(int eflag_in, int vflag_in) d_numneigh_short = Kokkos::View(Kokkos::ViewAllocateWithoutInitializing("FLARE::numneighs_short") ,batch_size); } + if(d_neighbors_short.extent(0) < batch_size || d_neighbors_short.extent(1) < max_neighs){ + d_neighbors_short = decltype(d_neighbors_short)(); + d_neighbors_short = Kokkos::View(Kokkos::ViewAllocateWithoutInitializing("FLARE::neighbors_short") ,batch_size,max_neighs); + } + // compute short neighbor list + int max_shortneighs; + int64_t nedges; + { + //auto ilist_curr_type = this->ilist_curr_type; + //auto x = this->x; + //auto type = this->type; + //auto d_numneigh = this->d_numneigh; + //auto d_neighbors = this->d_neighbors; + //auto cutoff_matrix_k = this->cutoff + Kokkos::parallel_reduce("FLARE: Short neighlist", Kokkos::RangePolicy(0,batch_size), + KOKKOS_CLASS_LAMBDA(const int ii, int64_t& nedges, int& max_shortneighs){ + const int i = ilist_curr_type[ii+startatom]; + const X_FLOAT xtmp = x(i,0); + const X_FLOAT ytmp = x(i,1); + const X_FLOAT ztmp = x(i,2); + + const int si = type[i] - 1; + + const int jnum = d_numneigh[i]; + int inside = 0; + for (int jj = 0; jj < jnum; jj++) { + int j = d_neighbors(i,jj); + j &= NEIGHMASK; + + const X_FLOAT delx = xtmp - x(j,0); + const X_FLOAT dely = ytmp - x(j,1); + const X_FLOAT delz = ztmp - x(j,2); + const F_FLOAT rsq = delx*delx + dely*dely + delz*delz; + + const double paircut = cutoff_matrix_k(si, type[j]-1); + + if (rsq < paircut*paircut) { + d_neighbors_short(ii,inside) = j; + inside++; + } + } + d_numneigh_short(ii) = inside; + nedges += d_numneigh_short(ii); + if (d_numneigh_short(ii) > max_shortneighs) max_shortneighs = d_numneigh_short(ii); + }, + Kokkos::Sum(nedges), + Kokkos::Max(max_shortneighs) + ); + } + + // reallocate per-neighbor views - if(g.extent(0) < batch_size || g.extent(1) < max_neighs){ - Kokkos::LayoutStride glayout(batch_size, max_neighs*n_max*4, - max_neighs, 1, - n_max, 4*max_neighs, - 4, max_neighs); - Kokkos::LayoutStride Ylayout(batch_size, max_neighs*n_harmonics*4, - max_neighs, 1, - n_harmonics, 4*max_neighs, - 4, max_neighs); + if(g.extent(0) < batch_size || g.extent(1) < max_shortneighs){ + Kokkos::LayoutStride glayout(batch_size, max_shortneighs*n_max*4, + max_shortneighs, 1, + n_max, 4*max_shortneighs, + 4, max_shortneighs); + Kokkos::LayoutStride Ylayout(batch_size, max_shortneighs*n_harmonics*4, + max_shortneighs, 1, + n_harmonics, 4*max_shortneighs, + 4, max_shortneighs); g = gYView4D(); Y = gYView4D(); g = gYView4D(Kokkos::ViewAllocateWithoutInitializing("FLARE: g"), glayout); Y = gYView4D(Kokkos::ViewAllocateWithoutInitializing("FLARE: Y"), Ylayout); partial_forces = View3D(); - partial_forces = View3D(Kokkos::ViewAllocateWithoutInitializing("FLARE: partial forces"), batch_size, max_neighs, 3); + partial_forces = View3D(Kokkos::ViewAllocateWithoutInitializing("FLARE: partial forces"), batch_size, max_shortneighs, 3); - d_neighbors_short = decltype(d_neighbors_short)(); - d_neighbors_short = Kokkos::View(Kokkos::ViewAllocateWithoutInitializing("FLARE::neighbors_short") ,batch_size,max_neighs); } - // compute short neighbor list - Kokkos::parallel_for("FLARE: Short neighlist", Kokkos::RangePolicy(0,batch_size), *this); // compute basis functions Rn and Ylm Kokkos::parallel_for("FLARE: R and Y", Kokkos::MDRangePolicy>( - {0,0}, {batch_size, max_neighs}, {1,max_neighs}), + {0,0}, {batch_size, max_shortneighs}, {1,max_shortneighs}), *this ); @@ -457,32 +505,41 @@ KOKKOS_INLINE_FUNCTION void PairFLAREKokkos::operator()(TagNorm2, const MemberType team_member) const{ int ii = team_member.league_rank(); - F_FLOAT tmp = 0.0; - Kokkos::parallel_reduce(Kokkos::TeamVectorRange(team_member, n_descriptors), [&] (int x, F_FLOAT &tmp){ - tmp += B2(ii, x) * B2(ii, x); - }, tmp); - B2_norm2s(ii) = tmp; + F_FLOAT tmp1 = 0.0; + F_FLOAT tmp2 = 0.0; + Kokkos::parallel_reduce(Kokkos::TeamVectorRange(team_member, n_descriptors), [&] (int x, F_FLOAT &tmp1, F_FLOAT &tmp2){ + tmp1 += B2(ii, x) * B2(ii, x); + tmp2 += B2(ii, x) * beta_B2(ii, x); + }, tmp1, tmp2); + Kokkos::single(Kokkos::PerTeam(team_member), [&] () { + B2_norm2s(ii) = tmp1; + evdwls(ii) = tmp2/B2_norm2s(ii); + if (d_numneigh_short(ii) == 0) + evdwls(ii) = 0; + if (eflag_atom) { + const int i = ilist_curr_type[ii+startatom]; + d_eatom[i] = evdwls(ii); + } + }); + team_member.team_barrier(); + /* tmp = 0.0; Kokkos::parallel_reduce(Kokkos::TeamVectorRange(team_member, n_descriptors), [&] (int x, F_FLOAT &tmp){ tmp += B2(ii, x) * beta_B2(ii, x); }, tmp); evdwls(ii) = tmp/B2_norm2s(ii); + */ if (d_numneigh_short(ii) == 0) { Kokkos::parallel_for(Kokkos::TeamVectorRange(team_member, n_descriptors), [&] (int x){ w(ii, x) = 0; }); - evdwls(ii) = 0; } else { Kokkos::parallel_for(Kokkos::TeamVectorRange(team_member, n_descriptors), [&] (int x){ w(ii, x) = 2*(evdwls(ii) * B2(ii,x) - beta_B2(ii,x))/B2_norm2s(ii); }); } - if (eflag_atom){ - const int i = ilist_curr_type[ii+startatom]; - d_eatom[i] = evdwls(ii); - } } @@ -540,25 +597,34 @@ void PairFLAREKokkos::operator()(TagF, const MemberType team_member) Yscratch(c, lm) = Y(ii, jj, lm, c); }); - for (int c = 0; c < 3; c++) { - F_FLOAT tmp = 0.0; - Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team_member, n_max*n_harmonics), [&](int nlm, F_FLOAT &tmp){ + F_FLOAT fx = 0, fy = 0, fz = 0; + Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team_member, n_max*n_harmonics), [&](int nlm, F_FLOAT &fx, F_FLOAT &fy, F_FLOAT &fz){ int n = nlm / n_harmonics; int lm = nlm - n*n_harmonics; int radial_index = s*n_max + n; + double uval = uscratch(radial_index, lm); + double gval = gscratch(0, n); - double gg = gscratch(c+1, n); + double gx = gscratch(1, n); + double gy = gscratch(2, n); + double gz = gscratch(3, n); double Yval = Yscratch(0, lm); - double Yg = Yscratch(c+1, lm); - - tmp += (gg*Yval + gval*Yg) * uscratch(radial_index, lm); - - }, tmp); - partial_forces(ii,jj,c) = tmp; + double Yx = Yscratch(1, lm); + double Yy = Yscratch(2, lm); + double Yz = Yscratch(3, lm); + + fx += (gx*Yval + gval*Yx) * uval; + fy += (gy*Yval + gval*Yy) * uval; + fz += (gz*Yval + gval*Yz) * uval; + + }, fx, fy,fz); + partial_forces(ii,jj,0) = fx; + partial_forces(ii,jj,1) = fy; + partial_forces(ii,jj,2) = fz; } - }); + ); } template From 51ebe9b623016ea9b3d08e21f024770031409ff3 Mon Sep 17 00:00:00 2001 From: Anders Johansson Date: Fri, 3 May 2024 13:29:16 -0700 Subject: [PATCH 05/11] attempt reduce register usage a bit in fij --- lammps_plugins/kokkos/pair_flare_kokkos.cpp | 22 +++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/lammps_plugins/kokkos/pair_flare_kokkos.cpp b/lammps_plugins/kokkos/pair_flare_kokkos.cpp index 41089b18a..b1de11ce4 100644 --- a/lammps_plugins/kokkos/pair_flare_kokkos.cpp +++ b/lammps_plugins/kokkos/pair_flare_kokkos.cpp @@ -606,18 +606,20 @@ void PairFLAREKokkos::operator()(TagF, const MemberType team_member) double uval = uscratch(radial_index, lm); double gval = gscratch(0, n); - double gx = gscratch(1, n); - double gy = gscratch(2, n); - double gz = gscratch(3, n); - double Yval = Yscratch(0, lm); - double Yx = Yscratch(1, lm); - double Yy = Yscratch(2, lm); - double Yz = Yscratch(3, lm); - fx += (gx*Yval + gval*Yx) * uval; - fy += (gy*Yval + gval*Yy) * uval; - fz += (gz*Yval + gval*Yz) * uval; + double gg, Yg; + gg = gscratch(1, n); + Yg = Yscratch(1, lm); + fx += (gg*Yval + gval*Yg) * uval; + + gg = gscratch(2, n); + Yg = Yscratch(3, lm); + fy += (gg*Yval + gval*Yg) * uval; + + gg = gscratch(3, n); + Yg = Yscratch(3, lm); + fz += (gg*Yval + gval*Yg) * uval; }, fx, fy,fz); partial_forces(ii,jj,0) = fx; From 35d78bd38bbf3b8360e69bb13fbd9a2eb1b057fe Mon Sep 17 00:00:00 2001 From: Anders Johansson Date: Tue, 11 Jun 2024 16:10:28 -0400 Subject: [PATCH 06/11] fix y component typo --- lammps_plugins/kokkos/pair_flare_kokkos.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lammps_plugins/kokkos/pair_flare_kokkos.cpp b/lammps_plugins/kokkos/pair_flare_kokkos.cpp index b1de11ce4..8fc826fd3 100644 --- a/lammps_plugins/kokkos/pair_flare_kokkos.cpp +++ b/lammps_plugins/kokkos/pair_flare_kokkos.cpp @@ -614,7 +614,7 @@ void PairFLAREKokkos::operator()(TagF, const MemberType team_member) fx += (gg*Yval + gval*Yg) * uval; gg = gscratch(2, n); - Yg = Yscratch(3, lm); + Yg = Yscratch(2, lm); fy += (gg*Yval + gval*Yg) * uval; gg = gscratch(3, n); From 1e1d6b97561689988c925b11ab0a7f753249d0b5 Mon Sep 17 00:00:00 2001 From: Anders Johansson Date: Tue, 11 Jun 2024 16:22:14 -0400 Subject: [PATCH 07/11] cleanup --- lammps_plugins/kokkos/pair_flare_kokkos.cpp | 47 +-------------------- 1 file changed, 2 insertions(+), 45 deletions(-) diff --git a/lammps_plugins/kokkos/pair_flare_kokkos.cpp b/lammps_plugins/kokkos/pair_flare_kokkos.cpp index 8fc826fd3..70ee08547 100644 --- a/lammps_plugins/kokkos/pair_flare_kokkos.cpp +++ b/lammps_plugins/kokkos/pair_flare_kokkos.cpp @@ -130,8 +130,7 @@ void PairFLAREKokkos::compute(int eflag_in, int vflag_in) // build short neighbor list max_neighs = d_neighbors.extent(1); - // TODO: check inum/ignum here - int n_atoms = neighflag == FULL ? inum : inum; + int n_atoms = inum; #ifdef LMP_KOKKOS_GPU @@ -174,11 +173,6 @@ void PairFLAREKokkos::compute(int eflag_in, int vflag_in) ); auto ilist_curr_type_idx = this->ilist_curr_type_idx; - //Kokkos::parallel_for(1, KOKKOS_LAMBDA(const int ii){ - // printf("curr_type_idx = %d\n", ilist_curr_type_idx(0)); - // }); - - //printf("\n%d %d\n\n", ilist_curr_type_idx, n_atoms_curr_type); } @@ -209,14 +203,9 @@ void PairFLAREKokkos::compute(int eflag_in, int vflag_in) n_batches = std::ceil(1.0*n_atoms_curr_type / approx_batch_size); approx_batch_size = n_atoms_curr_type / n_batches; - - //printf("maxmem = %g | betamem = %g | neighmem = %g | lmp_atom_mem = %g | mem_per_atom = %g | approx_batch_size = %d | n_batches = %d | remainder = %d\n", maxmem, beta_mem, neigh_mem, lmp_atom_mem, mem_per_atom, approx_batch_size, n_batches, n_atoms -n_batches* approx_batch_size); - } int remainder = n_atoms_curr_type - n_batches*approx_batch_size; - - startatom = 0; for(int batch_idx = 0; batch_idx < n_batches; batch_idx++){ batch_size = approx_batch_size + (remainder-- > 0 ? 1 : 0); @@ -248,16 +237,9 @@ void PairFLAREKokkos::compute(int eflag_in, int vflag_in) } // compute short neighbor list int max_shortneighs; - int64_t nedges; { - //auto ilist_curr_type = this->ilist_curr_type; - //auto x = this->x; - //auto type = this->type; - //auto d_numneigh = this->d_numneigh; - //auto d_neighbors = this->d_neighbors; - //auto cutoff_matrix_k = this->cutoff Kokkos::parallel_reduce("FLARE: Short neighlist", Kokkos::RangePolicy(0,batch_size), - KOKKOS_CLASS_LAMBDA(const int ii, int64_t& nedges, int& max_shortneighs){ + KOKKOS_CLASS_LAMBDA(const int ii, int& max_shortneighs){ const int i = ilist_curr_type[ii+startatom]; const X_FLOAT xtmp = x(i,0); const X_FLOAT ytmp = x(i,1); @@ -284,10 +266,8 @@ void PairFLAREKokkos::compute(int eflag_in, int vflag_in) } } d_numneigh_short(ii) = inside; - nedges += d_numneigh_short(ii); if (d_numneigh_short(ii) > max_shortneighs) max_shortneighs = d_numneigh_short(ii); }, - Kokkos::Sum(nedges), Kokkos::Max(max_shortneighs) ); } @@ -312,7 +292,6 @@ void PairFLAREKokkos::compute(int eflag_in, int vflag_in) } - // compute basis functions Rn and Ylm Kokkos::parallel_for("FLARE: R and Y", Kokkos::MDRangePolicy>( @@ -398,15 +377,11 @@ void PairFLAREKokkos::compute(int eflag_in, int vflag_in) if (eflag_atom) { - // if (need_dup) - // Kokkos::Experimental::contribute(d_eatom, dup_eatom); k_eatom.template modify(); k_eatom.template sync(); } if (vflag_atom) { - //if (need_dup) - //Kokkos::Experimental::contribute(d_vatom, dup_vatom); k_vatom.template modify(); k_vatom.template sync(); } @@ -433,13 +408,6 @@ void PairFLAREKokkos::operator()(const int ii, const int jj) const { calculate_radial_kokkos(ii, jj, g, delx, dely, delz, sqrt(rsq), cutoff_matrix_k(type[i]-1, type[j]-1), n_max); get_Y_kokkos(ii, jj, Y, delx, dely, delz, l_max); - /* - printf("i = %d, j = %d, Y =", i, j); - for(int h = 0; h < n_harmonics; h++){ - printf(" %g", Y(jj, h, 0)); - } - printf("\n"); - */ } template @@ -523,14 +491,6 @@ void PairFLAREKokkos::operator()(TagNorm2, const MemberType team_mem }); team_member.team_barrier(); - /* - tmp = 0.0; - Kokkos::parallel_reduce(Kokkos::TeamVectorRange(team_member, n_descriptors), [&] (int x, F_FLOAT &tmp){ - tmp += B2(ii, x) * beta_B2(ii, x); - }, tmp); - evdwls(ii) = tmp/B2_norm2s(ii); - */ - if (d_numneigh_short(ii) == 0) { Kokkos::parallel_for(Kokkos::TeamVectorRange(team_member, n_descriptors), [&] (int x){ w(ii, x) = 0; @@ -661,8 +621,6 @@ void PairFLAREKokkos::operator()(TagStoreF, const MemberType team_me const X_FLOAT dely = ytmp - x(j,1); const X_FLOAT delz = ztmp - x(j,2); - //printf("i = %d, j = %d, f = %g %g %g\n", i, j, fx, fy, fz); - if (vflag_either) v_tally(fvtmp.v,i,j,fx,fy,fz,delx,dely,delz); }, fvsum); team_member.team_barrier(); @@ -680,7 +638,6 @@ void PairFLAREKokkos::operator()(TagStoreF, const MemberType team_me ev.v[4] += fvsum.v[4]; ev.v[5] += fvsum.v[5]; } - //printf("i = %d, Fsum = %g %g %g\n", i, fsum.x, fsum.y, fsum.z); }); } From 59489865c4af1193b603f4154fe78a9a285892a8 Mon Sep 17 00:00:00 2001 From: Anders Johansson Date: Tue, 11 Jun 2024 21:08:48 +0000 Subject: [PATCH 08/11] re-enable MPI tests --- tests/test_lammps.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_lammps.py b/tests/test_lammps.py index 917aea9fe..c96ce0302 100644 --- a/tests/test_lammps.py +++ b/tests/test_lammps.py @@ -12,7 +12,7 @@ power_list = [1, 2] struc_list = ["random", "isolated"] rootdir = os.getcwd() -n_cpus_list = [1] # [1, 2] +n_cpus_list = [1, 2] @pytest.mark.skipif( @@ -67,7 +67,7 @@ def test_write_potential(n_species, n_types, power, struc, multicut, n_cpus): # Set up LAMMPS calculator. lmp_command = os.environ.get("lmp") - if (n_cpus > 1) and ("mpirun" not in lmp_command) and ("kokkos" not in lmp_command): + if (n_cpus > 1) and ("mpirun" not in lmp_command): lmp_command = f"mpirun -np {n_cpus} {lmp_command}" print(lmp_command) @@ -149,7 +149,7 @@ def test_lammps_uncertainty( os.chdir(rootdir) # Set up LAMMPS calculator. lmp_command = os.environ.get("lmp") - if (n_cpus > 1) and ("mpirun" not in lmp_command) and ("kokkos" not in lmp_command): + if (n_cpus > 1) and ("mpirun" not in lmp_command): lmp_command = f"mpirun -np {n_cpus} {lmp_command}" print(lmp_command) From 63fdb1ab62b690b44ceee55e5edbf41151aef7a5 Mon Sep 17 00:00:00 2001 From: Anders Johansson Date: Tue, 11 Jun 2024 21:10:50 +0000 Subject: [PATCH 09/11] version bump --- flare/_version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flare/_version.py b/flare/_version.py index 3e8d9f946..bf2561596 100644 --- a/flare/_version.py +++ b/flare/_version.py @@ -1 +1 @@ -__version__ = "1.4.0" +__version__ = "1.4.1" From d1132acb40284d573e42a21c1f135ba6bd9fb687 Mon Sep 17 00:00:00 2001 From: Anders Johansson Date: Tue, 11 Jun 2024 17:40:56 -0400 Subject: [PATCH 10/11] add MPI installation to CI --- .github/workflows/flare.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/flare.yml b/.github/workflows/flare.yml index f4dd43aa9..cf09ba3df 100644 --- a/.github/workflows/flare.yml +++ b/.github/workflows/flare.yml @@ -38,7 +38,7 @@ jobs: - name: Build run: | - sudo apt install liblapacke liblapacke-dev + sudo apt install liblapacke liblapacke-dev libopenmpi-dev mkdir ${BUILD_DIR} cd ${BUILD_DIR} From da5946d2232f823f4e5f88b6df3d70fd9ebddbce Mon Sep 17 00:00:00 2001 From: Anders Johansson Date: Tue, 11 Jun 2024 17:59:26 -0400 Subject: [PATCH 11/11] disable MPI test for uncertainty --- tests/test_lammps.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_lammps.py b/tests/test_lammps.py index c96ce0302..eaa1e6e40 100644 --- a/tests/test_lammps.py +++ b/tests/test_lammps.py @@ -135,7 +135,7 @@ def test_write_potential(n_species, n_types, power, struc, multicut, n_cpus): @pytest.mark.parametrize("power", power_list) @pytest.mark.parametrize("struc", struc_list) @pytest.mark.parametrize("multicut", [False, True]) -@pytest.mark.parametrize("n_cpus", n_cpus_list) +@pytest.mark.parametrize("n_cpus", [1]) @pytest.mark.parametrize("kernel_type", ["NormalizedDotProduct", "DotProduct"]) def test_lammps_uncertainty( n_species, n_types, use_map, power, struc, multicut, n_cpus, kernel_type,