Unverified Commit 7b7ca000 authored by Axel Kohlmeyer's avatar Axel Kohlmeyer Committed by GitHub
Browse files

Merge pull request #1922 from weinbe2/snap-scratch

SNAP optimizations - hierarchical parallelism, scratch memory, data layout transformations
parents 3be6347a 9d7b1563
Loading
Loading
Loading
Loading
+18 −2
Original line number Diff line number Diff line
@@ -37,12 +37,16 @@ struct TagPairSNAPBeta{};
struct TagPairSNAPComputeNeigh{};
struct TagPairSNAPPreUi{};
struct TagPairSNAPComputeUi{};
struct TagPairSNAPComputeUiTot{}; // accumulate ulist into ulisttot separately
struct TagPairSNAPComputeUiCPU{}; 
struct TagPairSNAPComputeZi{};
struct TagPairSNAPComputeBi{};
struct TagPairSNAPZeroYi{};
struct TagPairSNAPComputeYi{};
struct TagPairSNAPComputeDuidrj{};
struct TagPairSNAPComputeDuidrjCPU{};
struct TagPairSNAPComputeDeidrj{};
struct TagPairSNAPComputeDeidrjCPU{};

template<class DeviceType>
class PairSNAPKokkos : public PairSNAP {
@@ -74,11 +78,17 @@ public:
  void operator() (TagPairSNAPComputeNeigh,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeNeigh>::member_type& team) const;

  KOKKOS_INLINE_FUNCTION
  void operator() (TagPairSNAPPreUi,const int& ii) const;
  void operator() (TagPairSNAPPreUi,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPPreUi>::member_type& team) const;

  KOKKOS_INLINE_FUNCTION
  void operator() (TagPairSNAPComputeUi,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUi>::member_type& team) const;

  KOKKOS_INLINE_FUNCTION
  void operator() (TagPairSNAPComputeUiTot,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUiTot>::member_type& team) const;

  KOKKOS_INLINE_FUNCTION
  void operator() (TagPairSNAPComputeUiCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUiCPU>::member_type& team) const;

  KOKKOS_INLINE_FUNCTION
  void operator() (TagPairSNAPComputeZi,const int& ii) const;

@@ -86,7 +96,7 @@ public:
  void operator() (TagPairSNAPComputeBi,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeBi>::member_type& team) const;

  KOKKOS_INLINE_FUNCTION
  void operator() (TagPairSNAPZeroYi,const int& ii) const;
  void operator() (TagPairSNAPZeroYi,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPZeroYi>::member_type& team) const;

  KOKKOS_INLINE_FUNCTION
  void operator() (TagPairSNAPComputeYi,const int& ii) const;
@@ -94,9 +104,15 @@ public:
  KOKKOS_INLINE_FUNCTION
  void operator() (TagPairSNAPComputeDuidrj,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDuidrj>::member_type& team) const;

  KOKKOS_INLINE_FUNCTION
  void operator() (TagPairSNAPComputeDuidrjCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDuidrjCPU>::member_type& team) const;

  KOKKOS_INLINE_FUNCTION
  void operator() (TagPairSNAPComputeDeidrj,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDeidrj>::member_type& team) const;

  KOKKOS_INLINE_FUNCTION
  void operator() (TagPairSNAPComputeDeidrjCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDeidrjCPU>::member_type& team) const;

  KOKKOS_INLINE_FUNCTION
  void operator() (TagPairSNAPBeta,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPBeta>::member_type& team) const;

+229 −35
Original line number Diff line number Diff line
@@ -12,7 +12,8 @@
------------------------------------------------------------------------- */

/* ----------------------------------------------------------------------
   Contributing authors: Christian Trott (SNL), Stan Moore (SNL)
   Contributing authors: Christian Trott (SNL), Stan Moore (SNL),
                         Evan Weinberg (NVIDIA)
------------------------------------------------------------------------- */

#include <cmath>
@@ -29,6 +30,7 @@
#include "kokkos.h"
#include "sna.h"


#define MAXLINE 1024
#define MAXWORD 3

@@ -194,7 +196,7 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)

  if (beta_max < inum) {
    beta_max = inum;
    d_beta = Kokkos::View<F_FLOAT**, DeviceType>("PairSNAPKokkos:beta",inum,ncoeff);
    d_beta = Kokkos::View<F_FLOAT**, DeviceType>("PairSNAPKokkos:beta",ncoeff,inum);
    d_ninside = Kokkos::View<int*, DeviceType>("PairSNAPKokkos:ninside",inum);
  }

@@ -205,6 +207,8 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)

  EV_FLOAT ev;

  int idxu_max = snaKK.idxu_max;

  while (chunk_offset < inum) { // chunk up loop to prevent running out of memory

    EV_FLOAT ev_tmp;
@@ -217,15 +221,62 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
    Kokkos::parallel_for("ComputeNeigh",policy_neigh,*this);

    //PreUi
    typename Kokkos::RangePolicy<DeviceType, TagPairSNAPPreUi> policy_preui(0,chunk_size);
    {
      int vector_length = 1;
      int team_size = 1;
      if (lmp->kokkos->ngpus != 0) {
        vector_length = 32;
        team_size = 32;//max_neighs;
        int team_size_max = Kokkos::TeamPolicy<DeviceType, TagPairSNAPPreUi>::team_size_max(*this);
        if (team_size*vector_length > team_size_max)
          team_size = team_size_max/vector_length;
      }
      typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPPreUi> policy_preui((chunk_size+team_size-1)/team_size,team_size,vector_length);
      Kokkos::parallel_for("PreUi",policy_preui,*this);
    }

    // ComputeUI
    if (lmp->kokkos->ngpus == 0) { // CPU
      // Run a fused calculation of ulist and accumulation into ulisttot using atomics
      int vector_length = 1;
      int team_size = 1;

      typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUiCPU> policy_ui_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length);

      Kokkos::parallel_for("ComputeUiCPU",policy_ui_cpu,*this);
    } else { // GPU, vector parallelism, shared memory, separate ulist and ulisttot to avoid atomics

      // ComputeUi
      int vector_length = 32;
      int team_size = 4; // need to cap b/c of shared memory reqs
      int team_size_max = Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUi>::team_size_max(*this);
      if (team_size*vector_length > team_size_max)
        team_size = team_size_max/vector_length;

      // scratch size: 2 * team_size * (twojmax+1)^2, to cover all `m1`,`m2` values
        // 2 is for double buffer
      typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUi> policy_ui(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length);

      typedef Kokkos::View< SNAcomplex*,
                            Kokkos::DefaultExecutionSpace::scratch_memory_space,
                            Kokkos::MemoryTraits<Kokkos::Unmanaged> >
              ScratchViewType;
      int scratch_size = ScratchViewType::shmem_size( 2 * team_size * (twojmax+1)*(twojmax+1));
      policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam( scratch_size ));

      Kokkos::parallel_for("ComputeUi",policy_ui,*this);

    //Ulisttot transpose
    snaKK.transpose_ulisttot();
      // ComputeUitot
      vector_length = 1;
      team_size = 128;
      team_size_max = Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUiTot>::team_size_max(*this);
      if (team_size*vector_length > team_size_max)
        team_size = team_size_max/vector_length;

      typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUiTot> policy_ui_tot(((idxu_max+team_size-1)/team_size)*chunk_size,team_size,vector_length);
      Kokkos::parallel_for("ComputeUiTot",policy_ui_tot,*this);
    }


    //Compute bispectrum
    if (quadraticflag || eflag) {
@@ -237,15 +288,28 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
      //ComputeBi
      typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeBi> policy_bi(chunk_size,team_size,vector_length);
      Kokkos::parallel_for("ComputeBi",policy_bi,*this);

    }

    //Compute beta = dE_i/dB_i for all i in list
    typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPBeta> policy_beta(chunk_size,team_size,vector_length);
    Kokkos::parallel_for("ComputeBeta",policy_beta,*this);

    //ComputeYi
    typename Kokkos::RangePolicy<DeviceType, TagPairSNAPZeroYi> policy_zero_yi(0,chunk_size);
    //ZeroYi
    {
      int vector_length = 1;
      int team_size = 1;
      int team_size_max = Kokkos::TeamPolicy<DeviceType, TagPairSNAPZeroYi>::team_size_max(*this);

#ifdef KOKKOS_ENABLE_CUDA
      team_size = 128;
      if (team_size*vector_length > team_size_max)
        team_size = team_size_max/vector_length;
#endif

      typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPZeroYi> policy_zero_yi(((idxu_max+team_size-1)/team_size)*chunk_size,team_size,vector_length);
      Kokkos::parallel_for("ZeroYi",policy_zero_yi,*this);
    }

    //ComputeYi
    int idxz_max = snaKK.idxz_max;
@@ -253,12 +317,59 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
    Kokkos::parallel_for("ComputeYi",policy_yi,*this);

    //ComputeDuidrj
    if (lmp->kokkos->ngpus == 0) { // CPU
      int vector_length = 1;
      int team_size = 1;

      typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDuidrjCPU> policy_duidrj_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length);
      snaKK.set_dir(-1); // technically doesn't do anything
      Kokkos::parallel_for("ComputeDuidrjCPU",policy_duidrj_cpu,*this);
    } else { // GPU, utilize scratch memory and splitting over dimensions

      int team_size_max = Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDuidrj>::team_size_max(*this);
      int vector_length = 32;
      int team_size = 2; // need to cap b/c of shared memory reqs
      if (team_size*vector_length > team_size_max)
        team_size = team_size_max/vector_length;

      // scratch size: 2 * 2 * team_size * (twojmax+1)^2, to cover all `m1`,`m2` values
      // 2 is for double buffer
      typedef Kokkos::View< SNAcomplex*,
                          Kokkos::DefaultExecutionSpace::scratch_memory_space,
                          Kokkos::MemoryTraits<Kokkos::Unmanaged> >
            ScratchViewType;

      int scratch_size = ScratchViewType::shmem_size( 4 * team_size * (twojmax+1)*(twojmax+1));
      typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDuidrj> policy_duidrj(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length);
      policy_duidrj = policy_duidrj.set_scratch_size(0, Kokkos::PerTeam( scratch_size ));
      // Need to call three times, once for each direction
      for (int k = 0; k < 3; k++) {
        snaKK.set_dir(k);
        Kokkos::parallel_for("ComputeDuidrj",policy_duidrj,*this);
      }
    }

    //ComputeDeidrj
    if (lmp->kokkos->ngpus == 0) { // CPU
      int vector_length = 1;
      int team_size = 1;

      typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDeidrjCPU> policy_deidrj_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length);

      Kokkos::parallel_for("ComputeDeidrjCPU",policy_deidrj_cpu,*this);

    } else { // GPU, different loop strategy internally

      int team_size_max = Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDeidrj>::team_size_max(*this);
      int vector_length = 32; // coalescing disaster right now, will fix later
      int team_size = 8;
      if (team_size*vector_length > team_size_max)
        team_size = team_size_max/vector_length;

      typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDeidrj> policy_deidrj(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length);

      Kokkos::parallel_for("ComputeDeidrj",policy_deidrj,*this);
    }

    //ComputeForce
    if (eflag) {
@@ -284,6 +395,7 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
    }
    ev += ev_tmp;
    chunk_offset += chunk_size;

  } // end while

  if (need_dup)
@@ -341,18 +453,18 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPBeta,const typename Kokk
    d_coeffi(d_coeffelem,ielem,Kokkos::ALL);

  for (int icoeff = 0; icoeff < ncoeff; icoeff++)
    d_beta(ii,icoeff) = d_coeffi[icoeff+1];
    d_beta(icoeff,ii) = d_coeffi[icoeff+1];

  if (quadraticflag) {
    int k = ncoeff+1;
    for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
      double bveci = my_sna.blist(ii,icoeff);
      d_beta(ii,icoeff) += d_coeffi[k]*bveci;
      double bveci = my_sna.blist(icoeff,ii);
      d_beta(icoeff,ii) += d_coeffi[k]*bveci;
      k++;
      for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) {
        double bvecj = my_sna.blist(ii,jcoeff);
        d_beta(ii,icoeff) += d_coeffi[k]*bvecj;
        d_beta(ii,jcoeff) += d_coeffi[k]*bveci;
        double bvecj = my_sna.blist(jcoeff,ii);
        d_beta(icoeff,ii) += d_coeffi[k]*bvecj;
        d_beta(jcoeff,ii) += d_coeffi[k]*bveci;
        k++;
      }
    }
@@ -503,9 +615,14 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeNeigh,const typen

template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPPreUi,const int& ii) const {
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPPreUi,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPPreUi>::member_type& team) const {
  SNAKokkos<DeviceType> my_sna = snaKK;
  my_sna.pre_ui(ii);

  // Extract the atom number
  const int ii = team.team_rank() + team.team_size() * (team.league_rank() % ((chunk_size+team.team_size()-1)/team.team_size()));
  if (ii >= chunk_size) return;

  my_sna.pre_ui(team,ii);
}

template<class DeviceType>
@@ -514,8 +631,7 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeUi,const typename
  SNAKokkos<DeviceType> my_sna = snaKK;

  // Extract the atom number
  int ii = team.team_rank() + team.team_size() * (team.league_rank() %
           ((chunk_size+team.team_size()-1)/team.team_size()));
  int ii = team.team_rank() + team.team_size() * (team.league_rank() % ((chunk_size+team.team_size()-1)/team.team_size()));
  if (ii >= chunk_size) return;

  // Extract the neighbor number
@@ -528,9 +644,54 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeUi,const typename

template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPZeroYi,const int& ii) const {
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeUiTot,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUiTot>::member_type& team) const {
  SNAKokkos<DeviceType> my_sna = snaKK;

  // Extract the quantum number
  const int idx = team.team_rank() + team.team_size() * (team.league_rank() % ((my_sna.idxu_max+team.team_size()-1)/team.team_size()));
  if (idx >= my_sna.idxu_max) return;

  // Extract the atomic index
  const int ii = team.league_rank() / ((my_sna.idxu_max+team.team_size()-1)/team.team_size());
  if (ii >= chunk_size) return;

  // Extract the number of neighbors neighbor number
  const int ninside = d_ninside(ii);

  my_sna.compute_uitot(team,idx,ii,ninside);
}

template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeUiCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUiCPU>::member_type& team) const {
  SNAKokkos<DeviceType> my_sna = snaKK;

  // Extract the atom number
  int ii = team.team_rank() + team.team_size() * (team.league_rank() % ((chunk_size+team.team_size()-1)/team.team_size()));
  if (ii >= chunk_size) return;

  // Extract the neighbor number
  const int jj = team.league_rank() / ((chunk_size+team.team_size()-1)/team.team_size());
  const int ninside = d_ninside(ii);
  if (jj >= ninside) return;

  my_sna.compute_ui_cpu(team,ii,jj);
}

template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPZeroYi,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPZeroYi>::member_type& team) const {
  SNAKokkos<DeviceType> my_sna = snaKK;
  my_sna.zero_yi(ii);

  // Extract the quantum number
  const int idx = team.team_rank() + team.team_size() * (team.league_rank() % ((my_sna.idxu_max+team.team_size()-1)/team.team_size()));
  if (idx >= my_sna.idxu_max) return;

  // Extract the atomic index
  const int ii = team.league_rank() / ((my_sna.idxu_max+team.team_size()-1)/team.team_size());
  if (ii >= chunk_size) return;

  my_sna.zero_yi(idx,ii);
}

template<class DeviceType>
@@ -561,8 +722,7 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeDuidrj,const type
  SNAKokkos<DeviceType> my_sna = snaKK;

  // Extract the atom number
  int ii = team.team_rank() + team.team_size() * (team.league_rank() %
           ((chunk_size+team.team_size()-1)/team.team_size()));
  int ii = team.team_rank() + team.team_size() * (team.league_rank() % ((chunk_size+team.team_size()-1)/team.team_size()));
  if (ii >= chunk_size) return;

  // Extract the neighbor number
@@ -573,14 +733,31 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeDuidrj,const type
  my_sna.compute_duidrj(team,ii,jj);
}

template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeDuidrjCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDuidrjCPU>::member_type& team) const {
  SNAKokkos<DeviceType> my_sna = snaKK;

  // Extract the atom number
  int ii = team.team_rank() + team.team_size() * (team.league_rank() % ((chunk_size+team.team_size()-1)/team.team_size()));
  if (ii >= chunk_size) return;

  // Extract the neighbor number
  const int jj = team.league_rank() / ((chunk_size+team.team_size()-1)/team.team_size());
  const int ninside = d_ninside(ii);
  if (jj >= ninside) return;

  my_sna.compute_duidrj_cpu(team,ii,jj);
}


template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeDeidrj,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDeidrj>::member_type& team) const {
  SNAKokkos<DeviceType> my_sna = snaKK;

  // Extract the atom number
  int ii = team.team_rank() + team.team_size() * (team.league_rank() %
           ((chunk_size+team.team_size()-1)/team.team_size()));
  int ii = team.team_rank() + team.team_size() * (team.league_rank() % ((chunk_size+team.team_size()-1)/team.team_size()));
  if (ii >= chunk_size) return;

  // Extract the neighbor number
@@ -591,6 +768,23 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeDeidrj,const type
  my_sna.compute_deidrj(team,ii,jj);
}

template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeDeidrjCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDeidrjCPU>::member_type& team) const {
  SNAKokkos<DeviceType> my_sna = snaKK;

  // Extract the atom number
  int ii = team.team_rank() + team.team_size() * (team.league_rank() % ((chunk_size+team.team_size()-1)/team.team_size()));
  if (ii >= chunk_size) return;

  // Extract the neighbor number
  const int jj = team.league_rank() / ((chunk_size+team.team_size()-1)/team.team_size());
  const int ninside = d_ninside(ii);
  if (jj >= ninside) return;

  my_sna.compute_deidrj_cpu(team,ii,jj);
}

template<class DeviceType>
template<int NEIGHFLAG, int EVFLAG>
KOKKOS_INLINE_FUNCTION
@@ -658,17 +852,17 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeForce<NEIGHFLAG,E
        // linear contributions

        for (int icoeff = 0; icoeff < ncoeff; icoeff++)
          evdwl += d_coeffi[icoeff+1]*my_sna.blist(ii,icoeff);
          evdwl += d_coeffi[icoeff+1]*my_sna.blist(icoeff,ii);

        // quadratic contributions

        if (quadraticflag) {
          int k = ncoeff+1;
          for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
            double bveci = my_sna.blist(ii,icoeff);
            double bveci = my_sna.blist(icoeff,ii);
            evdwl += 0.5*d_coeffi[k++]*bveci*bveci;
            for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) {
              double bvecj = my_sna.blist(ii,jcoeff);
              double bvecj = my_sna.blist(jcoeff,ii);
              evdwl += d_coeffi[k++]*bveci*bvecj;
            }
          }
+91 −46

File changed.

Preview size limit exceeded, changes collapsed.

+573 −166

File changed.

Preview size limit exceeded, changes collapsed.