Commit 8cc43548 authored by Stan Moore's avatar Stan Moore
Browse files

Optimize Kokkos SNAP energy calculation

parent ad1b1897
Loading
Loading
Loading
Loading
+2 −0
Original line number Diff line number Diff line
@@ -132,8 +132,10 @@ inline double dist2(double* x,double* y);
  int need_dup;
  Kokkos::Experimental::ScatterView<F_FLOAT*[3], typename DAT::t_f_array::array_layout,DeviceType,Kokkos::Experimental::ScatterSum,Kokkos::Experimental::ScatterDuplicated> dup_f;
  Kokkos::Experimental::ScatterView<F_FLOAT*[6], typename DAT::t_virial_array::array_layout,DeviceType,Kokkos::Experimental::ScatterSum,Kokkos::Experimental::ScatterDuplicated> dup_vatom;
  Kokkos::Experimental::ScatterView<E_FLOAT*   , typename DAT::t_efloat_1d::array_layout,DeviceType,Kokkos::Experimental::ScatterSum,Kokkos::Experimental::ScatterDuplicated> dup_eatom;
  Kokkos::Experimental::ScatterView<F_FLOAT*[3], typename DAT::t_f_array::array_layout,DeviceType,Kokkos::Experimental::ScatterSum,Kokkos::Experimental::ScatterNonDuplicated> ndup_f;
  Kokkos::Experimental::ScatterView<F_FLOAT*[6], typename DAT::t_virial_array::array_layout,DeviceType,Kokkos::Experimental::ScatterSum,Kokkos::Experimental::ScatterNonDuplicated> ndup_vatom;
  Kokkos::Experimental::ScatterView<E_FLOAT*   , typename DAT::t_efloat_1d::array_layout,DeviceType,Kokkos::Experimental::ScatterSum,Kokkos::Experimental::ScatterNonDuplicated> ndup_eatom;

  friend void pair_virial_fdotr_compute<PairSNAPKokkos>(PairSNAPKokkos*);

+22 −11
Original line number Diff line number Diff line
@@ -174,9 +174,11 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
  if (need_dup) {
    dup_f     = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterDuplicated>(f);
    dup_vatom = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterDuplicated>(d_vatom);
    dup_eatom = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterDuplicated>(d_eatom);
  } else {
    ndup_f     = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterNonDuplicated>(f);
    ndup_vatom = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterNonDuplicated>(d_vatom);
    ndup_eatom = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterNonDuplicated>(d_eatom);
  }

  /*
@@ -258,6 +260,8 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)


  if (eflag_atom) {
    if (need_dup)
      Kokkos::Experimental::contribute(d_eatom, dup_eatom);
    k_eatom.template modify<DeviceType>();
    k_eatom.template sync<LMPHostType>();
  }
@@ -277,6 +281,7 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
  if (need_dup) {
    dup_f     = decltype(dup_f)();
    dup_vatom = decltype(dup_vatom)();
    dup_eatom = decltype(dup_eatom)();
  }
}

@@ -453,6 +458,11 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAP<NEIGHFLAG,EVFLAG>,const
  //t4 += timer.seconds(); timer.reset();
  team.team_barrier();

  if (quadraticflag) {
    my_sna.compute_bi(team);
    my_sna.copy_bi2bvec(team);
  }

  // for neighbors of I within cutoff:
  // compute dUi/drj and dBi/drj
  // Fij = dEi/dRj = -dEi/dRi => add to Fi, subtract from Fj
@@ -472,10 +482,6 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAP<NEIGHFLAG,EVFLAG>,const
    my_sna.compute_dbidrj(team);
    //t7 += timer2.seconds(); timer2.reset();
    my_sna.copy_dbi2dbvec(team);
    if (quadraticflag) {
      my_sna.compute_bi(team);
      my_sna.copy_bi2bvec(team);
    }

    Kokkos::single(Kokkos::PerThread(team), [&] (){
    F_FLOAT fij[3];
@@ -536,10 +542,10 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAP<NEIGHFLAG,EVFLAG>,const
    a_f(j,1) -= fij[1];
    a_f(j,2) -= fij[2];

    // tally per-atom virial contribution
    // tally global and per-atom virial contribution

    if (EVFLAG) {
      if (vflag) {
      if (vflag_either) {
        v_tally_xyz<NEIGHFLAG>(ev,i,j,
          fij[0],fij[1],fij[2],
          -my_sna.rij(jj,0),-my_sna.rij(jj,1),
@@ -554,7 +560,7 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAP<NEIGHFLAG,EVFLAG>,const
  // tally energy contribution

  if (EVFLAG) {
    if (eflag) {
    if (eflag_either) {

      if (!quadraticflag) {
        my_sna.compute_bi(team);
@@ -566,7 +572,6 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAP<NEIGHFLAG,EVFLAG>,const
      // coeff[k] = alpha_ii or
      // coeff[k] = alpha_ij = alpha_ji, j != i

      if (team.team_rank() == 0)
      Kokkos::single(Kokkos::PerThread(team), [&] () {

      // evdwl = energy of atom I, sum over coeffs_k * Bi_k
@@ -592,7 +597,13 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAP<NEIGHFLAG,EVFLAG>,const
//        ev_tally_full(i,2.0*evdwl,0.0,0.0,0.0,0.0,0.0);
        if (eflag_either) {
          if (eflag_global) ev.evdwl += evdwl;
          if (eflag_atom) d_eatom[i] += evdwl;
          if (eflag_atom) {
            // The eatom array is duplicated for OpenMP, atomic for CUDA, and neither for Serial
            
            auto v_eatom = ScatterViewHelper<NeedDup<NEIGHFLAG,DeviceType>::value,decltype(dup_eatom),decltype(ndup_eatom)>::get(dup_eatom,ndup_eatom);
            auto a_eatom = v_eatom.template access<AtomicDup<NEIGHFLAG,DeviceType>::value>();
            a_eatom[i] += evdwl;
          }
        }
      });
    }
+23 −12
Original line number Diff line number Diff line
@@ -327,29 +327,40 @@ void SNAKokkos<DeviceType>::compute_bi(const typename Kokkos::TeamPolicy<DeviceT
  clock_gettime(CLOCK_REALTIME, &starttime);
#endif

  Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,idxj_max),
  Kokkos::parallel_for(Kokkos::TeamThreadRange(team,idxj_full_max),
      [&] (const int& idx) {
    const int j1 = idxj(idx).j1;
    const int j2 = idxj(idx).j2;
    const int j =  idxj(idx).j;
    double b_j1_j2_j = 0.0;
    const int j1 = idxj_full(idx).j1;
    const int j2 = idxj_full(idx).j2;
    const int j =  idxj_full(idx).j;

    for(int mb = 0; 2*mb < j; mb++)
      for(int ma = 0; ma <= j; ma++) {
        b_j1_j2_j +=
    const int bound = (j+2)/2;
    double b_j1_j2_j = 0.0;
    double b_j1_j2_j_temp = 0.0;
    Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,(j+1)*bound),
        [&] (const int mbma, double& sum) {
        //for(int mb = 0; 2*mb <= j; mb++)
          //for(int ma = 0; ma <= j; ma++) {
        const int ma = mbma%(j+1);
        const int mb = mbma/(j+1);
        if (2*mb == j) return;
        sum +=
          uarraytot_r(j,ma,mb) * zarray_r(j1,j2,j,mb,ma) +
          uarraytot_i(j,ma,mb) * zarray_i(j1,j2,j,mb,ma);
      } // end loop over ma, mb
      },b_j1_j2_j_temp); // end loop over ma, mb
      b_j1_j2_j += b_j1_j2_j_temp;

    // For j even, special treatment for middle column

    if (j%2 == 0) {
      const int mb = j/2;
      for(int ma = 0; ma < mb; ma++) {
        b_j1_j2_j +=
      Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team, mb),
          [&] (const int ma, double& sum) {
      //for(int ma = 0; ma < mb; ma++) {
        sum +=
          uarraytot_r(j,ma,mb) * zarray_r(j1,j2,j,mb,ma) +
          uarraytot_i(j,ma,mb) * zarray_i(j1,j2,j,mb,ma);
      }
      },b_j1_j2_j_temp); // end loop over ma
      b_j1_j2_j += b_j1_j2_j_temp;

      const int ma = mb;
      b_j1_j2_j +=