Commit bcfc606e authored by Evan Weinberg
Browse files

SNAP optimizations, kernel fusion, large reduction of memory usage on the GPU,...

SNAP optimizations, kernel fusion, large reduction of memory usage on the GPU, misc. performance optimizations.
parent b533fdb3
Loading
Loading
Loading
Loading
+2 −10
Original line number Diff line number Diff line
@@ -37,15 +37,13 @@ struct TagPairSNAPBeta{};
// Empty tag types used for kernel dispatch: a tag is both the work-tag template
// argument of a Kokkos policy and the first parameter of the matching
// PairSNAPKokkos::operator() overload, so the policy type chosen for a
// parallel_for selects which overload runs.
struct TagPairSNAPComputeNeigh{};
struct TagPairSNAPPreUi{};
struct TagPairSNAPComputeUi{};
struct TagPairSNAPComputeUiTot{}; // accumulate ulist into ulisttot separately
struct TagPairSNAPComputeUiCPU{};
struct TagPairSNAPComputeZi{};
struct TagPairSNAPComputeBi{};
struct TagPairSNAPZeroYi{};
struct TagPairSNAPComputeYi{};
struct TagPairSNAPComputeDuidrj{};
struct TagPairSNAPComputeFusedDeidrj{};
struct TagPairSNAPComputeDuidrjCPU{};
struct TagPairSNAPComputeDeidrj{};
struct TagPairSNAPComputeDeidrjCPU{};

template<class DeviceType>
@@ -83,9 +81,6 @@ public:
  KOKKOS_INLINE_FUNCTION
  void operator() (TagPairSNAPComputeUi,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUi>::member_type& team) const;

  KOKKOS_INLINE_FUNCTION
  void operator() (TagPairSNAPComputeUiTot,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUiTot>::member_type& team) const;

  KOKKOS_INLINE_FUNCTION
  void operator() (TagPairSNAPComputeUiCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUiCPU>::member_type& team) const;

@@ -102,14 +97,11 @@ public:
  void operator() (TagPairSNAPComputeYi,const int& ii) const;

  KOKKOS_INLINE_FUNCTION
  void operator() (TagPairSNAPComputeDuidrj,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDuidrj>::member_type& team) const;
  void operator() (TagPairSNAPComputeFusedDeidrj,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeFusedDeidrj>::member_type& team) const;

  KOKKOS_INLINE_FUNCTION
  void operator() (TagPairSNAPComputeDuidrjCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDuidrjCPU>::member_type& team) const;

  KOKKOS_INLINE_FUNCTION
  void operator() (TagPairSNAPComputeDeidrj,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDeidrj>::member_type& team) const;

  KOKKOS_INLINE_FUNCTION
  void operator() (TagPairSNAPComputeDeidrjCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDeidrjCPU>::member_type& team) const;

+25 −86
Original line number Diff line number Diff line
@@ -30,7 +30,6 @@
#include "kokkos.h"
#include "sna.h"


#define MAXLINE 1024
#define MAXWORD 3

@@ -255,26 +254,19 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)

      // scratch size: 2 * team_size * (twojmax+1)^2, to cover all `m1`,`m2` values
        // 2 is for double buffer
      typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUi> policy_ui(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length);

      const int tile_size = (twojmax+1)*(twojmax+1);
      typedef Kokkos::View< SNAcomplex*,
                            Kokkos::DefaultExecutionSpace::scratch_memory_space,
                            Kokkos::MemoryTraits<Kokkos::Unmanaged> >
              ScratchViewType;
      int scratch_size = ScratchViewType::shmem_size( 2 * team_size * (twojmax+1)*(twojmax+1));
      int scratch_size = ScratchViewType::shmem_size( 2 * team_size * tile_size );

      typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUi> policy_ui(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length);
      policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam( scratch_size ));

      Kokkos::parallel_for("ComputeUi",policy_ui,*this);

      // ComputeUitot
      vector_length = 1;
      team_size = 128;
      team_size_max = Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUiTot>::team_size_max(*this);
      if (team_size*vector_length > team_size_max)
        team_size = team_size_max/vector_length;

      typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUiTot> policy_ui_tot(((idxu_max+team_size-1)/team_size)*chunk_size,team_size,vector_length);
      Kokkos::parallel_for("ComputeUiTot",policy_ui_tot,*this);
    }


@@ -316,7 +308,7 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
    typename Kokkos::RangePolicy<DeviceType, TagPairSNAPComputeYi> policy_yi(0,chunk_size*idxz_max);
    Kokkos::parallel_for("ComputeYi",policy_yi,*this);

    //ComputeDuidrj
    //ComputeDuidrj and Deidrj
    if (lmp->kokkos->ngpus == 0) { // CPU
      int vector_length = 1;
      int team_size = 1;
@@ -324,53 +316,37 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
      typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDuidrjCPU> policy_duidrj_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length);
      snaKK.set_dir(-1); // technically doesn't do anything
      Kokkos::parallel_for("ComputeDuidrjCPU",policy_duidrj_cpu,*this);
    } else { // GPU, utilize scratch memory and splitting over dimensions

      int team_size_max = Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDuidrj>::team_size_max(*this);
      typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDeidrjCPU> policy_deidrj_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length);

      Kokkos::parallel_for("ComputeDeidrjCPU",policy_deidrj_cpu,*this);
    } else { // GPU, utilize scratch memory and splitting over dimensions, fused dui and dei

      int team_size_max = Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeFusedDeidrj>::team_size_max(*this);
      int vector_length = 32;
      int team_size = 2; // need to cap b/c of shared memory reqs
      if (team_size*vector_length > team_size_max)
        team_size = team_size_max/vector_length;

      // scratch size: 2 * 2 * team_size * (twojmax+1)^2, to cover all `m1`,`m2` values
      // scratch size: 2 * 2 * team_size * (twojmax+1)*(twojmax/2+1), to cover half `m1`,`m2` values due to symmetry
      // 2 is for double buffer
      const int tile_size = (twojmax+1)*(twojmax/2+1);

      typedef Kokkos::View< SNAcomplex*,
                            Kokkos::DefaultExecutionSpace::scratch_memory_space,
                            Kokkos::MemoryTraits<Kokkos::Unmanaged> >
              ScratchViewType;
      int scratch_size = ScratchViewType::shmem_size( 4 * team_size * tile_size);

      typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeFusedDeidrj> policy_fused_deidrj(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length);
      policy_fused_deidrj = policy_fused_deidrj.set_scratch_size(0, Kokkos::PerTeam( scratch_size ));

      int scratch_size = ScratchViewType::shmem_size( 4 * team_size * (twojmax+1)*(twojmax+1));
      typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDuidrj> policy_duidrj(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length);
      policy_duidrj = policy_duidrj.set_scratch_size(0, Kokkos::PerTeam( scratch_size ));
      // Need to call three times, once for each direction
      for (int k = 0; k < 3; k++) {
        snaKK.set_dir(k);
        Kokkos::parallel_for("ComputeDuidrj",policy_duidrj,*this);
        Kokkos::parallel_for("ComputeFusedDeidrj",policy_fused_deidrj,*this);
      }
    }

    //ComputeDeidrj
    if (lmp->kokkos->ngpus == 0) { // CPU
      int vector_length = 1;
      int team_size = 1;

      typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDeidrjCPU> policy_deidrj_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length);

      Kokkos::parallel_for("ComputeDeidrjCPU",policy_deidrj_cpu,*this);

    } else { // GPU, different loop strategy internally

      int team_size_max = Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDeidrj>::team_size_max(*this);
      int vector_length = 32; // coalescing disaster right now, will fix later
      int team_size = 8;
      if (team_size*vector_length > team_size_max)
        team_size = team_size_max/vector_length;

      typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDeidrj> policy_deidrj(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length);

      Kokkos::parallel_for("ComputeDeidrj",policy_deidrj,*this);
    }

    //ComputeForce
    if (eflag) {
      if (neighflag == HALF) {
@@ -642,25 +618,6 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeUi,const typename
  my_sna.compute_ui(team,ii,jj);
}

// ComputeUiTot kernel body.
// Decodes the (league_rank, team_rank) pair into an index `idx` in
// [0, idxu_max) and an atom index `ii` in [0, chunk_size), then hands both,
// together with the atom's d_ninside entry, to SNAKokkos::compute_uitot.
// Threads that decode to an out-of-range idx or ii do nothing.
template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeUiTot,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUiTot>::member_type& team) const {
  SNAKokkos<DeviceType> my_sna = snaKK;

  const auto threads_per_team = team.team_size();
  const auto league_id = team.league_rank();

  // Number of teams needed to span idxu_max entries (rounded up)
  const auto teams_per_atom = (my_sna.idxu_max + threads_per_team - 1) / threads_per_team;

  // Index handled by this thread: offset of this team within its group of
  // teams_per_atom teams, plus the thread's rank inside the team
  const int idx = team.team_rank() + threads_per_team * (league_id % teams_per_atom);
  if (idx >= my_sna.idxu_max) return;

  // Atom handled by this team: one group of teams_per_atom teams per atom
  const int ii = league_id / teams_per_atom;
  if (ii >= chunk_size) return;

  // Neighbor count stored for this atom
  const int neighbor_count = d_ninside(ii);

  my_sna.compute_uitot(team,idx,ii,neighbor_count);
}

template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeUiCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUiCPU>::member_type& team) const {
@@ -718,7 +675,7 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeBi,const typename

template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeDuidrj,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDuidrj>::member_type& team) const {
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeFusedDeidrj,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeFusedDeidrj>::member_type& team) const {
  SNAKokkos<DeviceType> my_sna = snaKK;

  // Extract the atom number
@@ -730,7 +687,7 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeDuidrj,const type
  const int ninside = d_ninside(ii);
  if (jj >= ninside) return;

  my_sna.compute_duidrj(team,ii,jj);
  my_sna.compute_fused_deidrj(team,ii,jj);
}

template<class DeviceType>
@@ -750,24 +707,6 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeDuidrjCPU,const t
  my_sna.compute_duidrj_cpu(team,ii,jj);
}


// ComputeDeidrj kernel body.
// Decodes the (league_rank, team_rank) pair into an atom index `ii` in
// [0, chunk_size) and a neighbor index `jj`, and calls
// SNAKokkos::compute_deidrj for that pair when jj is below the atom's
// d_ninside entry. Out-of-range threads do nothing.
template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeDeidrj,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDeidrj>::member_type& team) const {
  SNAKokkos<DeviceType> my_sna = snaKK;

  const auto threads_per_team = team.team_size();
  const auto league_id = team.league_rank();

  // Number of teams needed to span chunk_size atoms (rounded up)
  const auto teams_per_neighbor = (chunk_size + threads_per_team - 1) / threads_per_team;

  // Atom handled by this thread
  const int ii = team.team_rank() + threads_per_team * (league_id % teams_per_neighbor);
  if (ii >= chunk_size) return;

  // Neighbor handled by this team; only valid below the atom's neighbor count
  const int jj = league_id / teams_per_neighbor;
  const int neighbor_count = d_ninside(ii);
  if (jj < neighbor_count) {
    my_sna.compute_deidrj(team,ii,jj);
  }
}

template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeDeidrjCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDeidrjCPU>::member_type& team) const {
+3 −17
Original line number Diff line number Diff line
@@ -135,14 +135,10 @@ inline
  KOKKOS_INLINE_FUNCTION
  void pre_ui(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team,const int&); // ForceSNAP
  KOKKOS_INLINE_FUNCTION
  void compute_ui(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int); // ForceSNAP
  void compute_ui(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, const int, const int); // ForceSNAP
  KOKKOS_INLINE_FUNCTION
  void compute_ui_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int); // ForceSNAP
  KOKKOS_INLINE_FUNCTION
  void compute_ui_orig(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int); // ForceSNAP
  KOKKOS_INLINE_FUNCTION
  void compute_uitot(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int, int); // ForceSNAP
  KOKKOS_INLINE_FUNCTION
  void compute_zi(const int&);    // ForceSNAP
  KOKKOS_INLINE_FUNCTION
  void zero_yi(const int&,const int&); // ForceSNAP
@@ -155,12 +151,10 @@ inline
  // functions for derivatives

  KOKKOS_INLINE_FUNCTION
  void compute_duidrj(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int); //ForceSNAP
  void compute_fused_deidrj(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, const int, const int); //ForceSNAP
  KOKKOS_INLINE_FUNCTION
  void compute_duidrj_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int); //ForceSNAP
  KOKKOS_INLINE_FUNCTION
  void compute_deidrj(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int); // ForceSNAP
  KOKKOS_INLINE_FUNCTION
  void compute_deidrj_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int); // ForceSNAP
  KOKKOS_INLINE_FUNCTION
  double compute_sfac(double, double); // add_uarraytot, compute_duarray
@@ -251,10 +245,6 @@ inline
  KOKKOS_INLINE_FUNCTION
  void add_uarraytot(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int, double, double, double); // compute_ui

  KOKKOS_INLINE_FUNCTION
  void compute_uarray(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int,
                      double, double, double,
                      double, double); // compute_ui
  KOKKOS_INLINE_FUNCTION
  void compute_uarray_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int,
                      double, double, double,
@@ -267,12 +257,8 @@ inline
inline
  int compute_ncoeff();           // SNAKokkos()
  KOKKOS_INLINE_FUNCTION
  void compute_duarray(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int,
                       double, double, double, // compute_duidrj
                       double, double, double, double, double);
  KOKKOS_INLINE_FUNCTION
  void compute_duarray_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int,
                       double, double, double, // compute_duidrj
                       double, double, double, // compute_duidrj_cpu
                       double, double, double, double, double);

  // Sets the style for the switching function
+319 −357

File changed.

Preview size limit exceeded, changes collapsed.