Commit cd61cfe8 authored by Evan Weinberg's avatar Evan Weinberg
Browse files

SNAP optimizations, taking advantage of improved data layouts, scratch memory,...

SNAP optimizations, taking advantage of improved data layouts, scratch memory, and hierarchical parallelism.
parent c5ee78ca
Loading
Loading
Loading
Loading
+18 −2
Original line number Diff line number Diff line
@@ -37,12 +37,16 @@ struct TagPairSNAPBeta{};
struct TagPairSNAPComputeNeigh{};
struct TagPairSNAPPreUi{};
struct TagPairSNAPComputeUi{};
struct TagPairSNAPComputeUiTot{}; // new --- accumulate ulist into ulisttot separately
struct TagPairSNAPComputeUiCPU{}; 
struct TagPairSNAPComputeZi{};
struct TagPairSNAPComputeBi{};
struct TagPairSNAPZeroYi{};
struct TagPairSNAPComputeYi{};
struct TagPairSNAPComputeDuidrj{};
struct TagPairSNAPComputeDuidrjCPU{};
struct TagPairSNAPComputeDeidrj{};
struct TagPairSNAPComputeDeidrjCPU{};

template<class DeviceType>
class PairSNAPKokkos : public PairSNAP {
@@ -74,11 +78,17 @@ public:
  void operator() (TagPairSNAPComputeNeigh,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeNeigh>::member_type& team) const;

  KOKKOS_INLINE_FUNCTION
  void operator() (TagPairSNAPPreUi,const int& ii) const;
  void operator() (TagPairSNAPPreUi,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPPreUi>::member_type& team) const;

  KOKKOS_INLINE_FUNCTION
  void operator() (TagPairSNAPComputeUi,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUi>::member_type& team) const;

  KOKKOS_INLINE_FUNCTION
  void operator() (TagPairSNAPComputeUiTot,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUiTot>::member_type& team) const;

  KOKKOS_INLINE_FUNCTION
  void operator() (TagPairSNAPComputeUiCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUiCPU>::member_type& team) const;

  KOKKOS_INLINE_FUNCTION
  void operator() (TagPairSNAPComputeZi,const int& ii) const;

@@ -86,7 +96,7 @@ public:
  void operator() (TagPairSNAPComputeBi,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeBi>::member_type& team) const;

  KOKKOS_INLINE_FUNCTION
  void operator() (TagPairSNAPZeroYi,const int& ii) const;
  void operator() (TagPairSNAPZeroYi,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPZeroYi>::member_type& team) const;

  KOKKOS_INLINE_FUNCTION
  void operator() (TagPairSNAPComputeYi,const int& ii) const;
@@ -94,9 +104,15 @@ public:
  KOKKOS_INLINE_FUNCTION
  void operator() (TagPairSNAPComputeDuidrj,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDuidrj>::member_type& team) const;

  KOKKOS_INLINE_FUNCTION
  void operator() (TagPairSNAPComputeDuidrjCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDuidrjCPU>::member_type& team) const;

  KOKKOS_INLINE_FUNCTION
  void operator() (TagPairSNAPComputeDeidrj,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDeidrj>::member_type& team) const;

  KOKKOS_INLINE_FUNCTION
  void operator() (TagPairSNAPComputeDeidrjCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDeidrjCPU>::member_type& team) const;

  KOKKOS_INLINE_FUNCTION
  void operator() (TagPairSNAPBeta,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPBeta>::member_type& team) const;

+243 −43
Original line number Diff line number Diff line
@@ -29,6 +29,8 @@
#include "kokkos.h"
#include "sna.h"

#include <chrono>

#define MAXLINE 1024
#define MAXWORD 3

@@ -194,17 +196,25 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)

  if (beta_max < inum) {
    beta_max = inum;
    d_beta = Kokkos::View<F_FLOAT**, DeviceType>("PairSNAPKokkos:beta",inum,ncoeff);
    // changed data layout to reflect new threading pattern
    //d_beta = Kokkos::View<F_FLOAT**, DeviceType>("PairSNAPKokkos:beta",inum,ncoeff);
    d_beta = Kokkos::View<F_FLOAT**, DeviceType>("PairSNAPKokkos:beta",ncoeff,inum);
    d_ninside = Kokkos::View<int*, DeviceType>("PairSNAPKokkos:ninside",inum);
  }

  chunk_size = MIN(chunksize,inum); // "chunksize" variable is set by user
  chunk_size = MIN(chunksize,inum);
  chunk_offset = 0;

  snaKK.grow_rij(chunk_size,max_neighs);

  EV_FLOAT ev;

  // lazy design
  int idxu_max = snaKK.idxu_max;

  int world_rank;
  MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);

  while (chunk_offset < inum) { // chunk up loop to prevent running out of memory

    EV_FLOAT ev_tmp;
@@ -217,15 +227,62 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
    Kokkos::parallel_for("ComputeNeigh",policy_neigh,*this);

    //PreUi
    typename Kokkos::RangePolicy<DeviceType, TagPairSNAPPreUi> policy_preui(0,chunk_size);
    {
      int vector_length = 1;
      int team_size = 1;
      if (lmp->kokkos->ngpus != 0) {
        vector_length = 32;
        team_size = 32;//max_neighs;
        int team_size_max = Kokkos::TeamPolicy<DeviceType, TagPairSNAPPreUi>::team_size_max(*this);
        if (team_size*vector_length > team_size_max)
          team_size = team_size_max/vector_length;
      }
      typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPPreUi> policy_preui((chunk_size+team_size-1)/team_size,team_size,vector_length);
      Kokkos::parallel_for("PreUi",policy_preui,*this);
    }

    // ComputeUI
    if (lmp->kokkos->ngpus == 0) { // CPU
      // Run a fused calculation of ulist and accumulation into ulisttot using atomics
      int vector_length = 1;
      int team_size = 1;

      typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUiCPU> policy_ui_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length);

      Kokkos::parallel_for("ComputeUiCPU",policy_ui_cpu,*this);
    } else { // GPU, vector parallelism, shared memory, separate ulist and ulisttot to avoid atomics

      // ComputeUi
      int vector_length = 32;
      int team_size = 4; // need to cap b/c of shared memory reqs
      int team_size_max = Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUi>::team_size_max(*this);
      if (team_size*vector_length > team_size_max)
        team_size = team_size_max/vector_length;

      // scratch size: 2 * team_size * (twojmax+1)^2, to cover all `m1`,`m2` values
        // 2 is for double buffer
      typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUi> policy_ui(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length);

      typedef Kokkos::View< SNAcomplex*,
                            Kokkos::DefaultExecutionSpace::scratch_memory_space,
                            Kokkos::MemoryTraits<Kokkos::Unmanaged> >
              ScratchViewType;
      int scratch_size = ScratchViewType::shmem_size( 2 * team_size * (twojmax+1)*(twojmax+1));
      policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam( scratch_size ));

      Kokkos::parallel_for("ComputeUi",policy_ui,*this);

    //Ulisttot transpose
    snaKK.transpose_ulisttot();
      // ComputeUitot
      vector_length = 1;
      team_size = 128;
      team_size_max = Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUiTot>::team_size_max(*this);
      if (team_size*vector_length > team_size_max)
        team_size = team_size_max/vector_length;

      typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUiTot> policy_ui_tot(((idxu_max+team_size-1)/team_size)*chunk_size,team_size,vector_length);
      Kokkos::parallel_for("ComputeUiTot",policy_ui_tot,*this);
    }


    //Compute bispectrum
    if (quadraticflag || eflag) {
@@ -237,15 +294,28 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
      //ComputeBi
      typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeBi> policy_bi(chunk_size,team_size,vector_length);
      Kokkos::parallel_for("ComputeBi",policy_bi,*this);

    }

    //Compute beta = dE_i/dB_i for all i in list
    typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPBeta> policy_beta(chunk_size,team_size,vector_length);
    Kokkos::parallel_for("ComputeBeta",policy_beta,*this);

    //ComputeYi
    typename Kokkos::RangePolicy<DeviceType, TagPairSNAPZeroYi> policy_zero_yi(0,chunk_size);
    //ZeroYi
    {
      int vector_length = 1;
      int team_size = 1;
      int team_size_max = Kokkos::TeamPolicy<DeviceType, TagPairSNAPZeroYi>::team_size_max(*this);

#ifdef KOKKOS_ENABLE_CUDA
      team_size = 128;
      if (team_size*vector_length > team_size_max)
        team_size = team_size_max/vector_length;
#endif

      typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPZeroYi> policy_zero_yi(((idxu_max+team_size-1)/team_size)*chunk_size,team_size,vector_length);
      Kokkos::parallel_for("ZeroYi",policy_zero_yi,*this);
    }

    //ComputeYi
    int idxz_max = snaKK.idxz_max;
@@ -253,12 +323,59 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
    Kokkos::parallel_for("ComputeYi",policy_yi,*this);

    //ComputeDuidrj
    if (lmp->kokkos->ngpus == 0) { // CPU
      int vector_length = 1;
      int team_size = 1;
      
      typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDuidrjCPU> policy_duidrj_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length);
      snaKK.set_dir(-1); // technically doesn't do anything
      Kokkos::parallel_for("ComputeDuidrjCPU",policy_duidrj_cpu,*this);
    } else { // GPU, utilize scratch memory and splitting over dimensions

      int team_size_max = Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDuidrj>::team_size_max(*this);
      int vector_length = 32;
      int team_size = 2; // need to cap b/c of shared memory reqs
      if (team_size*vector_length > team_size_max)
        team_size = team_size_max/vector_length;

      // scratch size: 2 * 2 * team_size * (twojmax+1)^2, to cover all `m1`,`m2` values
      // 2 is for double buffer
      typedef Kokkos::View< SNAcomplex*,
                          Kokkos::DefaultExecutionSpace::scratch_memory_space,
                          Kokkos::MemoryTraits<Kokkos::Unmanaged> >
            ScratchViewType;

      int scratch_size = ScratchViewType::shmem_size( 4 * team_size * (twojmax+1)*(twojmax+1));
      typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDuidrj> policy_duidrj(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length);
      policy_duidrj = policy_duidrj.set_scratch_size(0, Kokkos::PerTeam( scratch_size ));
      // Need to call three times, once for each direction
      for (int k = 0; k < 3; k++) {
        snaKK.set_dir(k);
        Kokkos::parallel_for("ComputeDuidrj",policy_duidrj,*this);
      }
    }

    //ComputeDeidrj
    if (lmp->kokkos->ngpus == 0) { // CPU
      int vector_length = 1;
      int team_size = 1;
    
      typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDeidrjCPU> policy_deidrj_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length);

      Kokkos::parallel_for("ComputeDeidrjCPU",policy_deidrj_cpu,*this);

    } else { // GPU, different loop strategy internally

      int team_size_max = Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDeidrj>::team_size_max(*this);
      int vector_length = 32; // coalescing disaster right now, will fix later
      int team_size = 8; 
      if (team_size*vector_length > team_size_max)
        team_size = team_size_max/vector_length;

      typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDeidrj> policy_deidrj(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length);

      Kokkos::parallel_for("ComputeDeidrj",policy_deidrj,*this);
    }

    //ComputeForce
    if (eflag) {
@@ -284,6 +401,7 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
    }
    ev += ev_tmp;
    chunk_offset += chunk_size;

  } // end while

  if (need_dup)
@@ -341,18 +459,18 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPBeta,const typename Kokk
    d_coeffi(d_coeffelem,ielem,Kokkos::ALL);

  for (int icoeff = 0; icoeff < ncoeff; icoeff++)
    d_beta(ii,icoeff) = d_coeffi[icoeff+1];
    d_beta(icoeff,ii) = d_coeffi[icoeff+1];

  if (quadraticflag) {
    int k = ncoeff+1;
    for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
      double bveci = my_sna.blist(ii,icoeff);
      d_beta(ii,icoeff) += d_coeffi[k]*bveci;
      double bveci = my_sna.blist(icoeff,ii);
      d_beta(icoeff,ii) += d_coeffi[k]*bveci;
      k++;
      for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) {
        double bvecj = my_sna.blist(ii,jcoeff);
        d_beta(ii,icoeff) += d_coeffi[k]*bvecj;
        d_beta(ii,jcoeff) += d_coeffi[k]*bveci;
        double bvecj = my_sna.blist(jcoeff,ii);
        d_beta(icoeff,ii) += d_coeffi[k]*bvecj;
        d_beta(jcoeff,ii) += d_coeffi[k]*bveci;
        k++;
      }
    }
@@ -503,9 +621,14 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeNeigh,const typen

template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPPreUi,const int& ii) const {
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPPreUi,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPPreUi>::member_type& team) const {
  SNAKokkos<DeviceType> my_sna = snaKK;
  my_sna.pre_ui(ii);

  // Extract the atom number
  const int ii = team.team_rank() + team.team_size() * (team.league_rank() % ((chunk_size+team.team_size()-1)/team.team_size()));
  if (ii >= chunk_size) return;

  my_sna.pre_ui(team,ii);
}

template<class DeviceType>
@@ -514,8 +637,7 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeUi,const typename
  SNAKokkos<DeviceType> my_sna = snaKK;

  // Extract the atom number
  int ii = team.team_rank() + team.team_size() * (team.league_rank() %
           ((chunk_size+team.team_size()-1)/team.team_size()));
  int ii = team.team_rank() + team.team_size() * (team.league_rank() % ((chunk_size+team.team_size()-1)/team.team_size()));
  if (ii >= chunk_size) return;

  // Extract the neighbor number
@@ -528,9 +650,54 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeUi,const typename

template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPZeroYi,const int& ii) const {
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeUiTot,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUiTot>::member_type& team) const {
  SNAKokkos<DeviceType> my_sna = snaKK;
  my_sna.zero_yi(ii);

  // Extract the quantum number
  const int idx = team.team_rank() + team.team_size() * (team.league_rank() % ((my_sna.idxu_max+team.team_size()-1)/team.team_size()));
  if (idx >= my_sna.idxu_max) return;

  // Extract the atomic index
  const int ii = team.league_rank() / ((my_sna.idxu_max+team.team_size()-1)/team.team_size());
  if (ii >= chunk_size) return;

  // Extract the number of neighbors neighbor number
  const int ninside = d_ninside(ii);

  my_sna.compute_uitot(team,idx,ii,ninside);
}

template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeUiCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUiCPU>::member_type& team) const {
  SNAKokkos<DeviceType> my_sna = snaKK;

  // Extract the atom number
  int ii = team.team_rank() + team.team_size() * (team.league_rank() % ((chunk_size+team.team_size()-1)/team.team_size()));
  if (ii >= chunk_size) return;

  // Extract the neighbor number
  const int jj = team.league_rank() / ((chunk_size+team.team_size()-1)/team.team_size());
  const int ninside = d_ninside(ii);
  if (jj >= ninside) return;

  my_sna.compute_ui_cpu(team,ii,jj);
}

template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPZeroYi,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPZeroYi>::member_type& team) const {
  SNAKokkos<DeviceType> my_sna = snaKK;

  // Extract the quantum number
  const int idx = team.team_rank() + team.team_size() * (team.league_rank() % ((my_sna.idxu_max+team.team_size()-1)/team.team_size()));
  if (idx >= my_sna.idxu_max) return;

  // Extract the atomic index
  const int ii = team.league_rank() / ((my_sna.idxu_max+team.team_size()-1)/team.team_size());
  if (ii >= chunk_size) return;

  my_sna.zero_yi(idx,ii);
}

template<class DeviceType>
@@ -561,8 +728,7 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeDuidrj,const type
  SNAKokkos<DeviceType> my_sna = snaKK;

  // Extract the atom number
  int ii = team.team_rank() + team.team_size() * (team.league_rank() %
           ((chunk_size+team.team_size()-1)/team.team_size()));
  int ii = team.team_rank() + team.team_size() * (team.league_rank() % ((chunk_size+team.team_size()-1)/team.team_size()));
  if (ii >= chunk_size) return;

  // Extract the neighbor number
@@ -573,14 +739,31 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeDuidrj,const type
  my_sna.compute_duidrj(team,ii,jj);
}

template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeDuidrjCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDuidrjCPU>::member_type& team) const {
  SNAKokkos<DeviceType> my_sna = snaKK;

  // Extract the atom number
  int ii = team.team_rank() + team.team_size() * (team.league_rank() % ((chunk_size+team.team_size()-1)/team.team_size()));
  if (ii >= chunk_size) return;

  // Extract the neighbor number
  const int jj = team.league_rank() / ((chunk_size+team.team_size()-1)/team.team_size());
  const int ninside = d_ninside(ii);
  if (jj >= ninside) return;

  my_sna.compute_duidrj_cpu(team,ii,jj);
}


template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeDeidrj,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDeidrj>::member_type& team) const {
  SNAKokkos<DeviceType> my_sna = snaKK;

  // Extract the atom number
  int ii = team.team_rank() + team.team_size() * (team.league_rank() %
           ((chunk_size+team.team_size()-1)/team.team_size()));
  int ii = team.team_rank() + team.team_size() * (team.league_rank() % ((chunk_size+team.team_size()-1)/team.team_size()));
  if (ii >= chunk_size) return;

  // Extract the neighbor number
@@ -591,6 +774,23 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeDeidrj,const type
  my_sna.compute_deidrj(team,ii,jj);
}

template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeDeidrjCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDeidrjCPU>::member_type& team) const {
  SNAKokkos<DeviceType> my_sna = snaKK;

  // Extract the atom number
  int ii = team.team_rank() + team.team_size() * (team.league_rank() % ((chunk_size+team.team_size()-1)/team.team_size()));
  if (ii >= chunk_size) return;

  // Extract the neighbor number
  const int jj = team.league_rank() / ((chunk_size+team.team_size()-1)/team.team_size());
  const int ninside = d_ninside(ii);
  if (jj >= ninside) return;
 
  my_sna.compute_deidrj_cpu(team,ii,jj);
}

template<class DeviceType>
template<int NEIGHFLAG, int EVFLAG>
KOKKOS_INLINE_FUNCTION
@@ -658,17 +858,17 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeForce<NEIGHFLAG,E
        // linear contributions
        
        for (int icoeff = 0; icoeff < ncoeff; icoeff++)
          evdwl += d_coeffi[icoeff+1]*my_sna.blist(ii,icoeff);
          evdwl += d_coeffi[icoeff+1]*my_sna.blist(icoeff,ii);
        
        // quadratic contributions
        
        if (quadraticflag) {
          int k = ncoeff+1;
          for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
            double bveci = my_sna.blist(ii,icoeff);
            double bveci = my_sna.blist(icoeff,ii);
            evdwl += 0.5*d_coeffi[k++]*bveci*bveci;
            for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) {
              double bvecj = my_sna.blist(ii,jcoeff);
              double bvecj = my_sna.blist(jcoeff,ii);
              evdwl += d_coeffi[k++]*bveci*bvecj;
            }
          }
+91 −46

File changed.

Preview size limit exceeded, changes collapsed.

+587 −178

File changed.

Preview size limit exceeded, changes collapsed.