Unverified Commit bd3b7129 authored by Axel Kohlmeyer's avatar Axel Kohlmeyer Committed by GitHub
Browse files

Merge pull request #1263 from stanmoore1/kk_snap

Optimize Kokkos SNAP energy calculation
parents 037cdfe0 e5b86910
Loading
Loading
Loading
Loading
+24 −20
Original line number Diff line number Diff line
@@ -256,7 +256,6 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)

  if (vflag_fdotr) pair_virial_fdotr_compute(this);


  if (eflag_atom) {
    k_eatom.template modify<DeviceType>();
    k_eatom.template sync<LMPHostType>();
@@ -453,6 +452,13 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAP<NEIGHFLAG,EVFLAG>,const
  //t4 += timer.seconds(); timer.reset();
  team.team_barrier();

  if (quadraticflag) {
    my_sna.compute_bi(team);
    team.team_barrier();
    my_sna.copy_bi2bvec(team);
    team.team_barrier();
  }

  // for neighbors of I within cutoff:
  // compute dUi/drj and dBi/drj
  // Fij = dEi/dRj = -dEi/dRi => add to Fi, subtract from Fj
@@ -472,10 +478,6 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAP<NEIGHFLAG,EVFLAG>,const
    my_sna.compute_dbidrj(team);
    //t7 += timer2.seconds(); timer2.reset();
    my_sna.copy_dbi2dbvec(team);
    if (quadraticflag) {
      my_sna.compute_bi(team);
      my_sna.copy_bi2bvec(team);
    }

    Kokkos::single(Kokkos::PerThread(team), [&] (){
    F_FLOAT fij[3];
@@ -536,10 +538,10 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAP<NEIGHFLAG,EVFLAG>,const
    a_f(j,1) -= fij[1];
    a_f(j,2) -= fij[2];

    // tally per-atom virial contribution
    // tally global and per-atom virial contribution

    if (EVFLAG) {
      if (vflag) {
      if (vflag_either) {
        v_tally_xyz<NEIGHFLAG>(ev,i,j,
          fij[0],fij[1],fij[2],
          -my_sna.rij(jj,0),-my_sna.rij(jj,1),
@@ -554,11 +556,13 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAP<NEIGHFLAG,EVFLAG>,const
  // tally energy contribution

  if (EVFLAG) {
    if (eflag) {
    if (eflag_either) {

      if (!quadraticflag) {
        my_sna.compute_bi(team);
        team.team_barrier();
        my_sna.copy_bi2bvec(team);
        team.team_barrier();
      }

      // E = beta.B + 0.5*B^t.alpha.B
@@ -566,14 +570,15 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAP<NEIGHFLAG,EVFLAG>,const
      // coeff[k] = alpha_ii or
      // coeff[k] = alpha_ij = alpha_ji, j != i

      if (team.team_rank() == 0)
      Kokkos::single(Kokkos::PerThread(team), [&] () {
      Kokkos::single(Kokkos::PerTeam(team), [&] () {

        // evdwl = energy of atom I, sum over coeffs_k * Bi_k

        double evdwl = d_coeffi[0];

        // linear contributions
        // could use thread vector range on this loop

        for (int k = 1; k <= ncoeff; k++)
          evdwl += d_coeffi[k]*my_sna.bvec[k-1];

@@ -589,11 +594,10 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAP<NEIGHFLAG,EVFLAG>,const
            }
          }
        }

        //ev_tally_full(i,2.0*evdwl,0.0,0.0,0.0,0.0,0.0);
        if (eflag_either) {
        if (eflag_global) ev.evdwl += evdwl;
        if (eflag_atom) d_eatom[i] += evdwl;
        }
      });
    }
  }
+33 −20
Original line number Diff line number Diff line
@@ -327,29 +327,40 @@ void SNAKokkos<DeviceType>::compute_bi(const typename Kokkos::TeamPolicy<DeviceT
  clock_gettime(CLOCK_REALTIME, &starttime);
#endif

  Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,idxj_max),
  Kokkos::parallel_for(Kokkos::TeamThreadRange(team,idxj_full_max),
      [&] (const int& idx) {
    const int j1 = idxj(idx).j1;
    const int j2 = idxj(idx).j2;
    const int j =  idxj(idx).j;
    double b_j1_j2_j = 0.0;
    const int j1 = idxj_full(idx).j1;
    const int j2 = idxj_full(idx).j2;
    const int j =  idxj_full(idx).j;

    for(int mb = 0; 2*mb < j; mb++)
      for(int ma = 0; ma <= j; ma++) {
        b_j1_j2_j +=
    const int bound = (j+2)/2;
    double b_j1_j2_j = 0.0;
    double b_j1_j2_j_temp = 0.0;
    Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,(j+1)*bound),
        [&] (const int mbma, double& sum) {
        //for(int mb = 0; 2*mb <= j; mb++)
          //for(int ma = 0; ma <= j; ma++) {
        const int ma = mbma%(j+1);
        const int mb = mbma/(j+1);
        if (2*mb == j) return;
        sum +=
          uarraytot_r(j,ma,mb) * zarray_r(j1,j2,j,mb,ma) +
          uarraytot_i(j,ma,mb) * zarray_i(j1,j2,j,mb,ma);
      } // end loop over ma, mb
      },b_j1_j2_j_temp); // end loop over ma, mb
      b_j1_j2_j += b_j1_j2_j_temp;

    // For j even, special treatment for middle column

    if (j%2 == 0) {
      const int mb = j/2;
      for(int ma = 0; ma < mb; ma++) {
        b_j1_j2_j +=
      Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team, mb),
          [&] (const int ma, double& sum) {
      //for(int ma = 0; ma < mb; ma++) {
        sum +=
          uarraytot_r(j,ma,mb) * zarray_r(j1,j2,j,mb,ma) +
          uarraytot_i(j,ma,mb) * zarray_i(j1,j2,j,mb,ma);
      }
      },b_j1_j2_j_temp); // end loop over ma
      b_j1_j2_j += b_j1_j2_j_temp;

      const int ma = mb;
      b_j1_j2_j +=
@@ -357,11 +368,13 @@ void SNAKokkos<DeviceType>::compute_bi(const typename Kokkos::TeamPolicy<DeviceT
         uarraytot_i(j,ma,mb) * zarray_i(j1,j2,j,mb,ma))*0.5;
    }

    Kokkos::single(Kokkos::PerThread(team), [&] () {
      b_j1_j2_j *= 2.0;
      if (bzero_flag)
        b_j1_j2_j -= bzero[j];

      barray(j1,j2,j) = b_j1_j2_j;
    });
  });
      //} // end loop over j
    //} // end loop over j1, j2
@@ -1028,6 +1041,8 @@ void SNAKokkos<DeviceType>::create_team_scratch_arrays(const typename Kokkos::Te
  uarraytot_i_a = uarraytot_i = t_sna_3d(team.team_scratch(1),jdim,jdim,jdim);
  zarray_r = t_sna_5d(team.team_scratch(1),jdim,jdim,jdim,jdim,jdim);
  zarray_i = t_sna_5d(team.team_scratch(1),jdim,jdim,jdim,jdim,jdim);
  bvec = Kokkos::View<double*, Kokkos::LayoutRight, DeviceType>(team.team_scratch(1),ncoeff);
  barray = t_sna_3d(team.team_scratch(1),jdim,jdim,jdim);

  rij = t_sna_2d(team.team_scratch(1),nmax,3);
  rcutij = t_sna_1d(team.team_scratch(1),nmax);
@@ -1046,6 +1061,8 @@ T_INT SNAKokkos<DeviceType>::size_team_scratch_arrays() {
  size += t_sna_3d::shmem_size(jdim,jdim,jdim); // uarraytot_i_a
  size += t_sna_5d::shmem_size(jdim,jdim,jdim,jdim,jdim); // zarray_r
  size += t_sna_5d::shmem_size(jdim,jdim,jdim,jdim,jdim); // zarray_i
  size += Kokkos::View<double*, Kokkos::LayoutRight, DeviceType>::shmem_size(ncoeff); // bvec
  size += t_sna_3d::shmem_size(jdim,jdim,jdim); // barray

  size += t_sna_2d::shmem_size(nmax,3); // rij
  size += t_sna_1d::shmem_size(nmax); // rcutij
@@ -1062,8 +1079,6 @@ KOKKOS_INLINE_FUNCTION
void SNAKokkos<DeviceType>::create_thread_scratch_arrays(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team)
{
  int jdim = twojmax + 1;
  bvec = Kokkos::View<double*, Kokkos::LayoutRight, DeviceType>(team.thread_scratch(1),ncoeff);
  barray = t_sna_3d(team.thread_scratch(1),jdim,jdim,jdim);

  dbvec = Kokkos::View<double*[3], Kokkos::LayoutRight, DeviceType>(team.thread_scratch(1),ncoeff);
  dbarray = t_sna_4d(team.thread_scratch(1),jdim,jdim,jdim);
@@ -1079,8 +1094,6 @@ inline
T_INT SNAKokkos<DeviceType>::size_thread_scratch_arrays() {
  T_INT size = 0;
  int jdim = twojmax + 1;
  size += Kokkos::View<double*, Kokkos::LayoutRight, DeviceType>::shmem_size(ncoeff); // bvec
  size += t_sna_3d::shmem_size(jdim,jdim,jdim); // barray

  size += Kokkos::View<double*[3], Kokkos::LayoutRight, DeviceType>::shmem_size(ncoeff); // dbvec
  size += t_sna_4d::shmem_size(jdim,jdim,jdim); // dbarray