Commit a5d27763 authored by Evan Weinberg
Browse files

Added support for symmetrized memory layouts for ui/duidrj for the CPU- and...

Added support for symmetrized memory layouts for ui/duidrj for the CPU- and GPU-path SNAP Kokkos implementation, various perf optimizations for ComputeUi/ComputeFusedDeidrj
parent ac43f8f6
Loading
Loading
Loading
Loading
+12 −1
Original line number Original line Diff line number Diff line
@@ -1058,7 +1058,7 @@ struct alignas(2*sizeof(real)) SNAComplex
{
{
  real re,im;
  real re,im;


  KOKKOS_FORCEINLINE_FUNCTION SNAComplex() = default;
  SNAComplex() = default;


  KOKKOS_FORCEINLINE_FUNCTION SNAComplex(real re)
  KOKKOS_FORCEINLINE_FUNCTION SNAComplex(real re)
   : re(re), im(static_cast<real>(0.)) { ; }
   : re(re), im(static_cast<real>(0.)) { ; }
@@ -1100,6 +1100,17 @@ KOKKOS_FORCEINLINE_FUNCTION SNAComplex<real> operator*(const real& r, const SNAC


typedef SNAComplex<SNAreal> SNAcomplex;
typedef SNAComplex<SNAreal> SNAcomplex;


// Cayley-Klein pack
// Can guarantee it's aligned to 2 complex
// (alignas(32) matches two SNAcomplex only if SNAreal is 8 bytes wide, because
//  SNAComplex is declared alignas(2*sizeof(real)) -- TODO confirm SNAreal width.)
//
// Plain aggregate grouping two complex values (a, b), a three-component complex
// array for each of them (da, db), one real value (sfac) and a three-component
// real array (dsfacu).
// NOTE(review): presumably filled by SNAKokkos::compute_cayley_klein and stored
// in the `cayleyklein` view as an up-front replacement for rij/wj/rcutij; the
// [3] arrays look like per-Cartesian-direction derivatives -- confirm against
// that routine before relying on this.
struct alignas(32) CayleyKleinPack {

  SNAcomplex a, b;          // two complex values of the pack
  SNAcomplex da[3], db[3];  // three-component complex companions of a and b
  SNAreal sfac;             // single real factor
  SNAreal dsfacu[3];        // three-component real companion of sfac

};



#if defined(KOKKOS_ENABLE_CXX11)
#if defined(KOKKOS_ENABLE_CXX11)
#undef ISFINITE
#undef ISFINITE
+5 −4
Original line number Original line Diff line number Diff line
@@ -50,6 +50,7 @@ struct TagPairSNAPComputeFusedDeidrj{};
// CPU backend only
// CPU backend only
struct TagPairSNAPPreUiCPU{};
struct TagPairSNAPPreUiCPU{};
struct TagPairSNAPComputeUiCPU{};
struct TagPairSNAPComputeUiCPU{};
struct TagPairSNAPTransformUiCPU{};
struct TagPairSNAPComputeZiCPU{};
struct TagPairSNAPComputeZiCPU{};
struct TagPairSNAPBetaCPU{};
struct TagPairSNAPBetaCPU{};
struct TagPairSNAPComputeBiCPU{};
struct TagPairSNAPComputeBiCPU{};
@@ -104,7 +105,7 @@ public:
  void operator() (TagPairSNAPComputeUi,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUi>::member_type& team) const;
  void operator() (TagPairSNAPComputeUi,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUi>::member_type& team) const;


  KOKKOS_INLINE_FUNCTION
  KOKKOS_INLINE_FUNCTION
  void operator() (TagPairSNAPTransformUi,const int iatom_mod, const int idxu, const int iatom_div) const;
  void operator() (TagPairSNAPTransformUi,const int iatom_mod, const int j, const int iatom_div) const;


  KOKKOS_INLINE_FUNCTION
  KOKKOS_INLINE_FUNCTION
  void operator() (TagPairSNAPComputeZi,const int iatom_mod, const int idxz, const int iatom_div) const;
  void operator() (TagPairSNAPComputeZi,const int iatom_mod, const int idxz, const int iatom_div) const;
@@ -135,13 +136,13 @@ public:
  void operator() (TagPairSNAPComputeUiCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUiCPU>::member_type& team) const;
  void operator() (TagPairSNAPComputeUiCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUiCPU>::member_type& team) const;


  KOKKOS_INLINE_FUNCTION
  KOKKOS_INLINE_FUNCTION
  void operator() (TagPairSNAPComputeZiCPU,const int& ii) const;
  void operator() (TagPairSNAPTransformUiCPU, const int j, const int iatom) const;


  KOKKOS_INLINE_FUNCTION
  KOKKOS_INLINE_FUNCTION
  void operator() (TagPairSNAPComputeBiCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeBiCPU>::member_type& team) const;
  void operator() (TagPairSNAPComputeZiCPU,const int& ii) const;


  KOKKOS_INLINE_FUNCTION
  KOKKOS_INLINE_FUNCTION
  void operator() (TagPairSNAPZeroYiCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPZeroYiCPU>::member_type& team) const;
  void operator() (TagPairSNAPComputeBiCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeBiCPU>::member_type& team) const;


  KOKKOS_INLINE_FUNCTION
  KOKKOS_INLINE_FUNCTION
  void operator() (TagPairSNAPComputeYiCPU,const int& ii) const;
  void operator() (TagPairSNAPComputeYiCPU,const int& ii) const;
+114 −47
Original line number Original line Diff line number Diff line
@@ -206,8 +206,6 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)


  EV_FLOAT ev;
  EV_FLOAT ev;


  int idxu_max = snaKK.idxu_max;

  while (chunk_offset < inum) { // chunk up loop to prevent running out of memory
  while (chunk_offset < inum) { // chunk up loop to prevent running out of memory


    EV_FLOAT ev_tmp;
    EV_FLOAT ev_tmp;
@@ -246,6 +244,13 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
        Kokkos::parallel_for("ComputeUiCPU",policy_ui_cpu,*this);
        Kokkos::parallel_for("ComputeUiCPU",policy_ui_cpu,*this);
      }
      }


      {
        // Expand ulisttot -> ulisttot_full
        // Zero out ylist
        typename Kokkos::MDRangePolicy<DeviceType, Kokkos::IndexType<int>, Kokkos::Rank<2, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagPairSNAPTransformUiCPU> policy_transform_ui_cpu({0,0},{twojmax+1,chunk_size});
        Kokkos::parallel_for("TransformUiCPU",policy_transform_ui_cpu,*this);
      }

      //Compute bispectrum
      //Compute bispectrum
      if (quadraticflag || eflag) {
      if (quadraticflag || eflag) {
        //ComputeZi
        //ComputeZi
@@ -261,20 +266,12 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
        Kokkos::parallel_for("ComputeBiCPU",policy_bi_cpu,*this);
        Kokkos::parallel_for("ComputeBiCPU",policy_bi_cpu,*this);
      }
      }


      //ZeroYi,ComputeYi
      //ComputeYi
      {
      {
        int vector_length = vector_length_default;
        int team_size = team_size_default;

        //Compute beta = dE_i/dB_i for all i in list
        //Compute beta = dE_i/dB_i for all i in list
        typename Kokkos::RangePolicy<DeviceType,TagPairSNAPBetaCPU> policy_beta(0,chunk_size);
        typename Kokkos::RangePolicy<DeviceType,TagPairSNAPBetaCPU> policy_beta(0,chunk_size);
        Kokkos::parallel_for("ComputeBetaCPU",policy_beta,*this);
        Kokkos::parallel_for("ComputeBetaCPU",policy_beta,*this);


        //ZeroYi
        check_team_size_for<TagPairSNAPZeroYiCPU>(chunk_size,team_size,vector_length);
        typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPZeroYiCPU> policy_zero_yi(((idxu_max+team_size-1)/team_size)*chunk_size,team_size,vector_length);
        Kokkos::parallel_for("ZeroYiCPU",policy_zero_yi,*this);

        //ComputeYi
        //ComputeYi
        int idxz_max = snaKK.idxz_max;
        int idxz_max = snaKK.idxz_max;
        typename Kokkos::RangePolicy<DeviceType,TagPairSNAPComputeYiCPU> policy_yi_cpu(0,chunk_size*idxz_max);
        typename Kokkos::RangePolicy<DeviceType,TagPairSNAPComputeYiCPU> policy_yi_cpu(0,chunk_size*idxz_max);
@@ -294,6 +291,7 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)


        Kokkos::parallel_for("ComputeDeidrjCPU",policy_deidrj_cpu,*this);
        Kokkos::parallel_for("ComputeDeidrjCPU",policy_deidrj_cpu,*this);
      }
      }

    } else { // GPU
    } else { // GPU


#ifdef LMP_KOKKOS_GPU
#ifdef LMP_KOKKOS_GPU
@@ -313,10 +311,10 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
        int team_size = 4; // need to cap b/c of shared memory reqs
        int team_size = 4; // need to cap b/c of shared memory reqs
        check_team_size_for<TagPairSNAPComputeUi>(chunk_size,team_size,vector_length);
        check_team_size_for<TagPairSNAPComputeUi>(chunk_size,team_size,vector_length);


        // scratch size: 2 * team_size * (twojmax+1)^2, to cover all `m1`,`m2` values
        // scratch size: 2 * team_size * (twojmax+1)^2, to cover all `m1`,`m2` values, div 2 for symmetry
        //   2 is for double buffer
        //   2 is for double buffer


        const int tile_size = (twojmax+1)*(twojmax+1);
        const int tile_size = (twojmax+1)*(twojmax/2+1);
        typedef Kokkos::View< SNAcomplex*,
        typedef Kokkos::View< SNAcomplex*,
                              Kokkos::DefaultExecutionSpace::scratch_memory_space,
                              Kokkos::DefaultExecutionSpace::scratch_memory_space,
                              Kokkos::MemoryTraits<Kokkos::Unmanaged> >
                              Kokkos::MemoryTraits<Kokkos::Unmanaged> >
@@ -329,7 +327,7 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
        Kokkos::parallel_for("ComputeUi",policy_ui,*this);
        Kokkos::parallel_for("ComputeUi",policy_ui,*this);


        //Transform data layout of ulisttot to AoSoA, zero ylist
        //Transform data layout of ulisttot to AoSoA, zero ylist
        typename Kokkos::MDRangePolicy<DeviceType, Kokkos::IndexType<int>, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagPairSNAPTransformUi> policy_transform_ui({0,0,0},{32,idxu_max,(chunk_size + 32 - 1) / 32},{32,4,1});
        typename Kokkos::MDRangePolicy<DeviceType, Kokkos::IndexType<int>, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagPairSNAPTransformUi> policy_transform_ui({0,0,0},{32,twojmax+1,(chunk_size + 32 - 1) / 32},{32,4,1});
        Kokkos::parallel_for("TransformUi",policy_transform_ui,*this);
        Kokkos::parallel_for("TransformUi",policy_transform_ui,*this);


      }
      }
@@ -367,7 +365,8 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
        Kokkos::parallel_for("ComputeYi",policy_compute_yi,*this);
        Kokkos::parallel_for("ComputeYi",policy_compute_yi,*this);


        //Transform data layout of ylist out of AoSoA
        //Transform data layout of ylist out of AoSoA
        typename Kokkos::MDRangePolicy<DeviceType, Kokkos::IndexType<int>, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagPairSNAPTransformYi> policy_transform_yi({0,0,0},{32,idxu_max,(chunk_size + 32 - 1) / 32},{32,4,1});
        const int idxu_half_max = snaKK.idxu_half_max;
        typename Kokkos::MDRangePolicy<DeviceType, Kokkos::IndexType<int>, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagPairSNAPTransformYi> policy_transform_yi({0,0,0},{32,idxu_half_max,(chunk_size + 32 - 1) / 32},{32,4,1});
        Kokkos::parallel_for("TransformYi",policy_transform_yi,*this);
        Kokkos::parallel_for("TransformYi",policy_transform_yi,*this);


      }
      }
@@ -397,7 +396,7 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
        }
        }
      }
      }


#endif // KOKKOS_ENABLE_CUDA
#endif // LMP_KOKKOS_GPU


    }
    }


@@ -608,12 +607,21 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeNeigh,const typen


    if ( rsq < rnd_cutsq(itype,jtype) ) {
    if ( rsq < rnd_cutsq(itype,jtype) ) {
      if (final) {
      if (final) {
#ifdef LMP_KOKKOS_GPU
        if (std::is_same<DeviceType,Kokkos::Cuda>::value) {
          my_sna.compute_cayley_klein(ii, offset, dx, dy, dz, (radi + d_radelem[elem_j])*rcutfac,
                                      d_wjelem[elem_j]);
        } else {
#endif
          my_sna.rij(ii,offset,0) = dx;
          my_sna.rij(ii,offset,0) = dx;
          my_sna.rij(ii,offset,1) = dy;
          my_sna.rij(ii,offset,1) = dy;
          my_sna.rij(ii,offset,2) = dz;
          my_sna.rij(ii,offset,2) = dz;
        my_sna.inside(ii,offset) = j;
          my_sna.wj(ii,offset) = d_wjelem[elem_j];
          my_sna.wj(ii,offset) = d_wjelem[elem_j];
          my_sna.rcutij(ii,offset) = (radi + d_radelem[elem_j])*rcutfac;
          my_sna.rcutij(ii,offset) = (radi + d_radelem[elem_j])*rcutfac;
#ifdef LMP_KOKKOS_GPU
        }
#endif
        my_sna.inside(ii,offset) = j;
        if (chemflag)
        if (chemflag)
          my_sna.element(ii,offset) = elem_j;
          my_sna.element(ii,offset) = elem_j;
        else
        else
@@ -704,27 +712,56 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeUi,const typename


template<class DeviceType>
template<class DeviceType>
KOKKOS_INLINE_FUNCTION
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPTransformUi,const int iatom_mod, const int idxu, const int iatom_div) const {
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPTransformUi,const int iatom_mod, const int j, const int iatom_div) const {
  SNAKokkos<DeviceType> my_sna = snaKK;
  SNAKokkos<DeviceType> my_sna = snaKK;


  const int iatom = iatom_mod + iatom_div * 32;
  const int iatom = iatom_mod + iatom_div * 32;
  if (iatom >= chunk_size) return;
  if (iatom >= chunk_size) return;


  if (idxu >= my_sna.idxu_max) return;
  if (j > twojmax) return; 


  int elem_count = chemflag ? nelements : 1;
  int elem_count = chemflag ? nelements : 1;


  for (int ielem = 0; ielem < elem_count; ielem++) {
  for (int ielem = 0; ielem < elem_count; ielem++) {
    const int jju_half = my_sna.idxu_half_block(j);
    const int jju = my_sna.idxu_block(j);

    for (int mb = 0; 2*mb <= j; mb++) {
      for (int ma = 0; ma <= j; ma++) {
        // Extract top half


    const auto utot_re = my_sna.ulisttot_re(idxu, ielem, iatom);
        const int idxu_shift = mb * (j + 1) + ma;
    const auto utot_im = my_sna.ulisttot_im(idxu, ielem, iatom);
        const int idxu_half = jju_half + idxu_shift;
        const int idxu = jju + idxu_shift;


        auto utot_re = my_sna.ulisttot_re(idxu_half, ielem, iatom);
        auto utot_im = my_sna.ulisttot_im(idxu_half, ielem, iatom);

        // Store
        my_sna.ulisttot_pack(iatom_mod, idxu, ielem, iatom_div) = { utot_re, utot_im };
        my_sna.ulisttot_pack(iatom_mod, idxu, ielem, iatom_div) = { utot_re, utot_im };
          
          
    my_sna.ylist_pack_re(iatom_mod, idxu, ielem, iatom_div) = 0.;
        // Also zero yi
    my_sna.ylist_pack_im(iatom_mod, idxu, ielem, iatom_div) = 0.;
        my_sna.ylist_pack_re(iatom_mod, idxu_half, ielem, iatom_div) = 0.;
        my_sna.ylist_pack_im(iatom_mod, idxu_half, ielem, iatom_div) = 0.;

        // Symmetric term
        const int sign_factor = (((ma+mb)%2==0)?1:-1);
        const int idxu_flip = jju + (j + 1 - mb) * (j + 1) - (ma + 1);

        if (sign_factor == 1) {
          utot_im = -utot_im;
        } else {
          utot_re = -utot_re;
        }
        }


        my_sna.ulisttot_pack(iatom_mod, idxu_flip, ielem, iatom_div) = { utot_re, utot_im };

        // No need to zero symmetrized ylist
        //my_sna.ylist_pack_re(iatom_mod, idxu_flip, ielem, iatom_div) = 0.;
        //my_sna.ylist_pack_im(iatom_mod, idxu_flip, ielem, iatom_div) = 0.;
      }
    }
  }
}
}


template<class DeviceType>
template<class DeviceType>
@@ -742,20 +779,20 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeYi,const int iato


template<class DeviceType>
template<class DeviceType>
KOKKOS_INLINE_FUNCTION
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPTransformYi,const int iatom_mod, const int idxu, const int iatom_div) const {
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPTransformYi,const int iatom_mod, const int idxu_half, const int iatom_div) const {
  SNAKokkos<DeviceType> my_sna = snaKK;
  SNAKokkos<DeviceType> my_sna = snaKK;


  const int iatom = iatom_mod + iatom_div * 32;
  const int iatom = iatom_mod + iatom_div * 32;
  if (iatom >= chunk_size) return;
  if (iatom >= chunk_size) return;


  if (idxu >= my_sna.idxu_max) return;
  if (idxu_half >= my_sna.idxu_half_max) return;


  int elem_count = chemflag ? nelements : 1;
  int elem_count = chemflag ? nelements : 1;
  for (int ielem = 0; ielem < elem_count; ielem++) {
  for (int ielem = 0; ielem < elem_count; ielem++) {
    const auto y_re = my_sna.ylist_pack_re(iatom_mod, idxu, ielem, iatom_div);
    const auto y_re = my_sna.ylist_pack_re(iatom_mod, idxu_half, ielem, iatom_div);
    const auto y_im = my_sna.ylist_pack_im(iatom_mod, idxu, ielem, iatom_div);
    const auto y_im = my_sna.ylist_pack_im(iatom_mod, idxu_half, ielem, iatom_div);


    my_sna.ylist(idxu, ielem, iatom) = { y_re, y_im };
    my_sna.ylist(idxu_half, ielem, iatom) = { y_re, y_im };
  }
  }


}
}
@@ -904,22 +941,52 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeUiCPU,const typen


template<class DeviceType>
template<class DeviceType>
KOKKOS_INLINE_FUNCTION
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPZeroYiCPU,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPZeroYiCPU>::member_type& team) const {
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPTransformUiCPU, const int j, const int iatom) const {
  SNAKokkos<DeviceType> my_sna = snaKK;
  SNAKokkos<DeviceType> my_sna = snaKK;


  // Extract the quantum number
  if (iatom >= chunk_size) return;
  const int idx = team.team_rank() + team.team_size() * (team.league_rank() % ((my_sna.idxu_max+team.team_size()-1)/team.team_size()));
  if (idx >= my_sna.idxu_max) return;


  // Extract the atomic index
  if (j > twojmax) return; 
  const int ii = team.league_rank() / ((my_sna.idxu_max+team.team_size()-1)/team.team_size());
  if (ii >= chunk_size) return;


  if (chemflag)
  int elem_count = chemflag ? nelements : 1;
    for(int ielem = 0; ielem < nelements; ielem++)

      my_sna.zero_yi_cpu(idx,ii,ielem);
  // De-symmetrize ulisttot
  else
  for (int ielem = 0; ielem < elem_count; ielem++) {
    my_sna.zero_yi_cpu(idx,ii,0);

    const int jju_half = my_sna.idxu_half_block(j);
    const int jju = my_sna.idxu_block(j);

    for (int mb = 0; 2*mb <= j; mb++) {
      for (int ma = 0; ma <= j; ma++) {
        // Extract top half

        const int idxu_shift = mb * (j + 1) + ma;
        const int idxu_half = jju_half + idxu_shift;
        const int idxu = jju + idxu_shift;

        // Load ulist
        auto utot = my_sna.ulisttot(idxu_half, ielem, iatom);

        // Store
        my_sna.ulisttot_full(idxu, ielem, iatom) = utot;

        // Zero Yi
        my_sna.ylist(idxu_half, ielem, iatom) = {0., 0.};

        // Symmetric term
        const int sign_factor = (((ma+mb)%2==0)?1:-1);
        const int idxu_flip = jju + (j + 1 - mb) * (j + 1) - (ma + 1);

        if (sign_factor == 1) {
          utot.im = -utot.im;
        } else {
          utot.re = -utot.re;
        }

        my_sna.ulisttot_full(idxu_flip, ielem, iatom) = utot;
      }
    }
  }
}
}


template<class DeviceType>
template<class DeviceType>
+18 −3
Original line number Original line Diff line number Diff line
@@ -55,6 +55,8 @@ public:
  typedef Kokkos::View<SNAcomplex**[3], DeviceType> t_sna_3c3;
  typedef Kokkos::View<SNAcomplex**[3], DeviceType> t_sna_3c3;
  typedef Kokkos::View<SNAcomplex*****, DeviceType> t_sna_5c;
  typedef Kokkos::View<SNAcomplex*****, DeviceType> t_sna_5c;


  typedef Kokkos::View<CayleyKleinPack**, DeviceType> t_sna_2ckp; 

inline
inline
  SNAKokkos() {};
  SNAKokkos() {};
  KOKKOS_INLINE_FUNCTION
  KOKKOS_INLINE_FUNCTION
@@ -78,6 +80,9 @@ inline


  // functions for bispectrum coefficients, GPU only
  // functions for bispectrum coefficients, GPU only
  KOKKOS_INLINE_FUNCTION
  KOKKOS_INLINE_FUNCTION
  void compute_cayley_klein(const int&, const int&, const double&, const double&,
                            const double&, const double&, const double&);
  KOKKOS_INLINE_FUNCTION
  void pre_ui(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team,const int&,const int&); // ForceSNAP
  void pre_ui(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team,const int&,const int&); // ForceSNAP
  KOKKOS_INLINE_FUNCTION
  KOKKOS_INLINE_FUNCTION
  void compute_ui(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, const int, const int); // ForceSNAP
  void compute_ui(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, const int, const int); // ForceSNAP
@@ -97,8 +102,6 @@ inline
  KOKKOS_INLINE_FUNCTION
  KOKKOS_INLINE_FUNCTION
  void compute_zi_cpu(const int&);    // ForceSNAP
  void compute_zi_cpu(const int&);    // ForceSNAP
  KOKKOS_INLINE_FUNCTION
  KOKKOS_INLINE_FUNCTION
  void zero_yi_cpu(const int&,const int&,const int&); // ForceSNAP
  KOKKOS_INLINE_FUNCTION
  void compute_yi_cpu(int,
  void compute_yi_cpu(int,
   const Kokkos::View<F_FLOAT**, DeviceType> &beta); // ForceSNAP
   const Kokkos::View<F_FLOAT**, DeviceType> &beta); // ForceSNAP
    KOKKOS_INLINE_FUNCTION
    KOKKOS_INLINE_FUNCTION
@@ -117,6 +120,8 @@ inline
  double compute_sfac(double, double); // add_uarraytot, compute_duarray
  double compute_sfac(double, double); // add_uarraytot, compute_duarray
  KOKKOS_INLINE_FUNCTION
  KOKKOS_INLINE_FUNCTION
  double compute_dsfac(double, double); // compute_duarray
  double compute_dsfac(double, double); // compute_duarray
  KOKKOS_INLINE_FUNCTION
  void compute_s_dsfac(const double, const double, double&, double&); // compute_cayley_klein


  // efficient complex FMA
  // efficient complex FMA
  // efficient caxpy (i.e., y += a x)
  // efficient caxpy (i.e., y += a x)
@@ -140,6 +145,9 @@ inline


  //per sna class instance for OMP use
  //per sna class instance for OMP use


  // Alternative to rij, wj, rcutij...
  // just calculate everything up front
  t_sna_2ckp cayleyklein;


  // Per InFlight Particle
  // Per InFlight Particle
  t_sna_3d rij;
  t_sna_3d rij;
@@ -156,6 +164,7 @@ inline


  t_sna_3d_ll blist;
  t_sna_3d_ll blist;
  t_sna_3c_ll ulisttot;
  t_sna_3c_ll ulisttot;
  t_sna_3c_ll ulisttot_full; // un-folded ulisttot, cpu only
  t_sna_3c_ll zlist;
  t_sna_3c_ll zlist;


  t_sna_3c_ll ulist;
  t_sna_3c_ll ulist;
@@ -173,7 +182,7 @@ inline
  t_sna_4d_ll ylist_pack_re; // split real,
  t_sna_4d_ll ylist_pack_re; // split real,
  t_sna_4d_ll ylist_pack_im; // imag AoSoA layout
  t_sna_4d_ll ylist_pack_im; // imag AoSoA layout


  int idxcg_max, idxu_max, idxz_max, idxb_max;
  int idxcg_max, idxu_max, idxu_half_max, idxu_cache_max, idxz_max, idxb_max;


  // Chem snap counts
  // Chem snap counts
  int nelements;
  int nelements;
@@ -188,7 +197,13 @@ private:
  Kokkos::View<int*[10], DeviceType> idxz;
  Kokkos::View<int*[10], DeviceType> idxz;
  Kokkos::View<int*[3], DeviceType> idxb;
  Kokkos::View<int*[3], DeviceType> idxb;
  Kokkos::View<int***, DeviceType> idxcg_block;
  Kokkos::View<int***, DeviceType> idxcg_block;

public:
  Kokkos::View<int*, DeviceType> idxu_block;
  Kokkos::View<int*, DeviceType> idxu_block;
  Kokkos::View<int*, DeviceType> idxu_half_block;
  Kokkos::View<int*, DeviceType> idxu_cache_block;

private:
  Kokkos::View<int***, DeviceType> idxz_block;
  Kokkos::View<int***, DeviceType> idxz_block;
  Kokkos::View<int***, DeviceType> idxb_block;
  Kokkos::View<int***, DeviceType> idxb_block;


+380 −242

File changed.

Preview size limit exceeded, changes collapsed.