Unverified commit 2fea8f88, authored by Axel Kohlmeyer
Browse files

Merge branch 'master' into collected-small-changes

parents 3aee1b75 d63f3d87
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -13,7 +13,7 @@ Syntax
Examples
""""""""

.. code-blocK:: LAMMPS
.. code-block:: LAMMPS

   bond_style none

+1 −1
Original line number Diff line number Diff line
@@ -16,7 +16,7 @@ Syntax
Examples
""""""""

.. code:: LAMMPS
.. code-block:: LAMMPS

   pair_style meam/spline
   pair_coeff * * Ti.meam.spline Ti
+12 −1
Original line number Diff line number Diff line
@@ -1058,7 +1058,7 @@ struct alignas(2*sizeof(real)) SNAComplex
{
  real re,im;

  KOKKOS_FORCEINLINE_FUNCTION SNAComplex() = default;
  SNAComplex() = default;

  KOKKOS_FORCEINLINE_FUNCTION SNAComplex(real re)
   : re(re), im(static_cast<real>(0.)) { ; }
@@ -1100,6 +1100,17 @@ KOKKOS_FORCEINLINE_FUNCTION SNAComplex<real> operator*(const real& r, const SNAC

typedef SNAComplex<SNAreal> SNAcomplex;

// Cayley-Klein pack: a single record holding two complex values, two
// 3-entry complex arrays, one real value and one 3-entry real array.
// alignas(32) pins the record to a 32-byte boundary (the original author's
// note: "can guarantee it's aligned to 2 complex").
struct alignas(32) CayleyKleinPack {

  SNAcomplex a;
  SNAcomplex b;

  // NOTE(review): presumably the three derivative components of a and b -- confirm
  SNAcomplex da[3];
  SNAcomplex db[3];

  // NOTE(review): presumably a scalar factor and its three derivative components -- confirm
  SNAreal sfac;
  SNAreal dsfacu[3];

};


#if defined(KOKKOS_ENABLE_CXX11)
#undef ISFINITE
+5 −4
Original line number Diff line number Diff line
@@ -50,6 +50,7 @@ struct TagPairSNAPComputeFusedDeidrj{};
// Dispatch tags used by the CPU backend only.
// Each tag is an empty type; it exists purely so that a matching
// operator() overload / execution policy can be selected by type.
struct TagPairSNAPPreUiCPU {};
struct TagPairSNAPComputeUiCPU {};
struct TagPairSNAPTransformUiCPU {};
struct TagPairSNAPComputeZiCPU {};
struct TagPairSNAPBetaCPU {};
struct TagPairSNAPComputeBiCPU {};
@@ -104,7 +105,7 @@ public:
  void operator() (TagPairSNAPComputeUi,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUi>::member_type& team) const;

  // TagPairSNAPTransformUi overload, called with three int indices:
  // (iatom_mod, <middle index>, iatom_div).
  // NOTE(review): two declarations follow because this listing is a diff
  // rendering -- the first names the middle index idxu, the second names it j.
  // Only the parameter name differs; the parameter types are the same.
  KOKKOS_INLINE_FUNCTION
  void operator() (TagPairSNAPTransformUi,const int iatom_mod, const int idxu, const int iatom_div) const;
  void operator() (TagPairSNAPTransformUi,const int iatom_mod, const int j, const int iatom_div) const;

  // TagPairSNAPComputeZi overload, called with (iatom_mod, idxz, iatom_div).
  KOKKOS_INLINE_FUNCTION
  void operator() (TagPairSNAPComputeZi,const int iatom_mod, const int idxz, const int iatom_div) const;
@@ -135,13 +136,13 @@ public:
  void operator() (TagPairSNAPComputeUiCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUiCPU>::member_type& team) const;

  // CPU-backend operator() overloads, selected by their tag type.
  // NOTE(review): this stretch is a diff rendering. Where one
  // KOKKOS_INLINE_FUNCTION is followed by two declarations, the first appears
  // to be the line being replaced and the second its replacement -- confirm
  // against the real header before relying on the order shown here.
  //
  // Argument shapes visible below:
  //   - TagPairSNAPTransformUiCPU: two plain ints (j, iatom)
  //   - TagPairSNAPComputeZiCPU / TagPairSNAPComputeYiCPU: one flat index ii
  //   - TagPairSNAPComputeBiCPU / TagPairSNAPZeroYiCPU: a Kokkos::TeamPolicy member
  KOKKOS_INLINE_FUNCTION
  void operator() (TagPairSNAPComputeZiCPU,const int& ii) const;
  void operator() (TagPairSNAPTransformUiCPU, const int j, const int iatom) const;

  KOKKOS_INLINE_FUNCTION
  void operator() (TagPairSNAPComputeBiCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeBiCPU>::member_type& team) const;
  void operator() (TagPairSNAPComputeZiCPU,const int& ii) const;

  KOKKOS_INLINE_FUNCTION
  void operator() (TagPairSNAPZeroYiCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPZeroYiCPU>::member_type& team) const;
  void operator() (TagPairSNAPComputeBiCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeBiCPU>::member_type& team) const;

  KOKKOS_INLINE_FUNCTION
  void operator() (TagPairSNAPComputeYiCPU,const int& ii) const;
+112 −47
Original line number Diff line number Diff line
@@ -206,8 +206,6 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)

  EV_FLOAT ev;

  int idxu_max = snaKK.idxu_max;

  while (chunk_offset < inum) { // chunk up loop to prevent running out of memory

    EV_FLOAT ev_tmp;
@@ -246,6 +244,13 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
        Kokkos::parallel_for("ComputeUiCPU",policy_ui_cpu,*this);
      }

      {
        // Expand ulisttot -> ulisttot_full
        // Zero out ylist
        typename Kokkos::MDRangePolicy<DeviceType, Kokkos::IndexType<int>, Kokkos::Rank<2, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagPairSNAPTransformUiCPU> policy_transform_ui_cpu({0,0},{twojmax+1,chunk_size});
        Kokkos::parallel_for("TransformUiCPU",policy_transform_ui_cpu,*this);
      }

      //Compute bispectrum
      if (quadraticflag || eflag) {
        //ComputeZi
@@ -261,20 +266,12 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
        Kokkos::parallel_for("ComputeBiCPU",policy_bi_cpu,*this);
      }

      //ZeroYi,ComputeYi
      //ComputeYi
      {
        int vector_length = vector_length_default;
        int team_size = team_size_default;

        //Compute beta = dE_i/dB_i for all i in list
        typename Kokkos::RangePolicy<DeviceType,TagPairSNAPBetaCPU> policy_beta(0,chunk_size);
        Kokkos::parallel_for("ComputeBetaCPU",policy_beta,*this);

        //ZeroYi
        check_team_size_for<TagPairSNAPZeroYiCPU>(chunk_size,team_size,vector_length);
        typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPZeroYiCPU> policy_zero_yi(((idxu_max+team_size-1)/team_size)*chunk_size,team_size,vector_length);
        Kokkos::parallel_for("ZeroYiCPU",policy_zero_yi,*this);

        //ComputeYi
        int idxz_max = snaKK.idxz_max;
        typename Kokkos::RangePolicy<DeviceType,TagPairSNAPComputeYiCPU> policy_yi_cpu(0,chunk_size*idxz_max);
@@ -294,6 +291,7 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)

        Kokkos::parallel_for("ComputeDeidrjCPU",policy_deidrj_cpu,*this);
      }

    } else { // GPU

#ifdef LMP_KOKKOS_GPU
@@ -313,10 +311,10 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
        int team_size = 4; // need to cap b/c of shared memory reqs
        check_team_size_for<TagPairSNAPComputeUi>(chunk_size,team_size,vector_length);

        // scratch size: 2 * team_size * (twojmax+1)^2, to cover all `m1`,`m2` values
        // scratch size: 2 * team_size * (twojmax+1)^2, to cover all `m1`,`m2` values, div 2 for symmetry
        //   2 is for double buffer

        const int tile_size = (twojmax+1)*(twojmax+1);
        const int tile_size = (twojmax+1)*(twojmax/2+1);
        typedef Kokkos::View< SNAcomplex*,
                              Kokkos::DefaultExecutionSpace::scratch_memory_space,
                              Kokkos::MemoryTraits<Kokkos::Unmanaged> >
@@ -329,7 +327,7 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
        Kokkos::parallel_for("ComputeUi",policy_ui,*this);

        //Transform data layout of ulisttot to AoSoA, zero ylist
        typename Kokkos::MDRangePolicy<DeviceType, Kokkos::IndexType<int>, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagPairSNAPTransformUi> policy_transform_ui({0,0,0},{32,idxu_max,(chunk_size + 32 - 1) / 32},{32,4,1});
        typename Kokkos::MDRangePolicy<DeviceType, Kokkos::IndexType<int>, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagPairSNAPTransformUi> policy_transform_ui({0,0,0},{32,twojmax+1,(chunk_size + 32 - 1) / 32},{32,4,1});
        Kokkos::parallel_for("TransformUi",policy_transform_ui,*this);

      }
@@ -367,7 +365,8 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
        Kokkos::parallel_for("ComputeYi",policy_compute_yi,*this);

        //Transform data layout of ylist out of AoSoA
        typename Kokkos::MDRangePolicy<DeviceType, Kokkos::IndexType<int>, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagPairSNAPTransformYi> policy_transform_yi({0,0,0},{32,idxu_max,(chunk_size + 32 - 1) / 32},{32,4,1});
        const int idxu_half_max = snaKK.idxu_half_max;
        typename Kokkos::MDRangePolicy<DeviceType, Kokkos::IndexType<int>, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagPairSNAPTransformYi> policy_transform_yi({0,0,0},{32,idxu_half_max,(chunk_size + 32 - 1) / 32},{32,4,1});
        Kokkos::parallel_for("TransformYi",policy_transform_yi,*this);

      }
@@ -397,7 +396,7 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
        }
      }

#endif // KOKKOS_ENABLE_CUDA
#endif // LMP_KOKKOS_GPU

    }

@@ -608,12 +607,21 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeNeigh,const typen

    if ( rsq < rnd_cutsq(itype,jtype) ) {
      if (final) {
#ifdef LMP_KOKKOS_GPU
        if (std::is_same<DeviceType,Kokkos::Cuda>::value) {
          my_sna.compute_cayley_klein(ii, offset, dx, dy, dz, (radi + d_radelem[elem_j])*rcutfac,
                                      d_wjelem[elem_j]);
        } else {
#endif
          my_sna.rij(ii,offset,0) = dx;
          my_sna.rij(ii,offset,1) = dy;
          my_sna.rij(ii,offset,2) = dz;
        my_sna.inside(ii,offset) = j;
          my_sna.wj(ii,offset) = d_wjelem[elem_j];
          my_sna.rcutij(ii,offset) = (radi + d_radelem[elem_j])*rcutfac;
#ifdef LMP_KOKKOS_GPU
        }
#endif
        my_sna.inside(ii,offset) = j;
        if (chemflag)
          my_sna.element(ii,offset) = elem_j;
        else
@@ -704,27 +712,54 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeUi,const typename

// TagPairSNAPTransformUi functor body, called with an (iatom_mod, j, iatom_div)
// index triple; the atom index is rebuilt as iatom_mod + 32 * iatom_div.
// For each element index and each (mb, ma) with 2*mb <= j and ma <= j it
//   - reads ulisttot_re / ulisttot_im at idxu_half,
//   - writes that {re, im} pair into ulisttot_pack at idxu,
//   - zeroes ylist_pack_re / ylist_pack_im at idxu_half,
//   - writes a sign-adjusted copy into ulisttot_pack at the mirrored index idxu_flip.
// NOTE(review): this listing is a diff rendering, so statements from the
// earlier version (which received idxu directly and copied a single entry)
// are interleaved with the current ones -- e.g. the two signature lines below.
template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPTransformUi,const int iatom_mod, const int idxu, const int iatom_div) const {
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPTransformUi,const int iatom_mod, const int j, const int iatom_div) const {
  SNAKokkos<DeviceType> my_sna = snaKK;

  // Rebuild the atom index from its (mod 32, div 32) split and ignore
  // indices that fall past the end of the current chunk.
  const int iatom = iatom_mod + iatom_div * 32;
  if (iatom >= chunk_size) return;

  // Range guard on the second index (earlier form: idxu; current form: j).
  if (idxu >= my_sna.idxu_max) return;
  if (j > twojmax) return; 

  // One pass when chemflag is off, nelements passes otherwise.
  int elem_count = chemflag ? nelements : 1;

  for (int ielem = 0; ielem < elem_count; ielem++) {
    // Base offsets for this j; idxu_shift is added to each below to form
    // idxu_half and idxu respectively.
    const int jju_half = my_sna.idxu_half_block(j);
    const int jju = my_sna.idxu_block(j);

    for (int mb = 0; 2*mb <= j; mb++) {
      for (int ma = 0; ma <= j; ma++) {
        // Extract top half

    const auto utot_re = my_sna.ulisttot_re(idxu, ielem, iatom);
    const auto utot_im = my_sna.ulisttot_im(idxu, ielem, iatom);
        // Row-major position of (mb, ma) inside a block that is (j + 1) wide.
        const int idxu_shift = mb * (j + 1) + ma;
        const int idxu_half = jju_half + idxu_shift;
        const int idxu = jju + idxu_shift;

        auto utot_re = my_sna.ulisttot_re(idxu_half, ielem, iatom);
        auto utot_im = my_sna.ulisttot_im(idxu_half, ielem, iatom);

        // Store
        my_sna.ulisttot_pack(iatom_mod, idxu, ielem, iatom_div) = { utot_re, utot_im };
          
    my_sna.ylist_pack_re(iatom_mod, idxu, ielem, iatom_div) = 0.;
    my_sna.ylist_pack_im(iatom_mod, idxu, ielem, iatom_div) = 0.;
        // Also zero yi
        my_sna.ylist_pack_re(iatom_mod, idxu_half, ielem, iatom_div) = 0.;
        my_sna.ylist_pack_im(iatom_mod, idxu_half, ielem, iatom_div) = 0.;

        // Symmetric term
        // sign_factor is +1 when (ma + mb) is even and -1 when it is odd.
        const int sign_factor = (((ma+mb)%2==0)?1:-1);
        // Algebraically jju + (j - mb) * (j + 1) + (j - ma), i.e. the
        // (j - mb, j - ma) entry of the same (j + 1)-wide block.
        const int idxu_flip = jju + (j + 1 - mb) * (j + 1) - (ma + 1);

        // +1: negate the imaginary part; -1: negate the real part.
        if (sign_factor == 1) {
          utot_im = -utot_im;
        } else {
          utot_re = -utot_re;
        }

        my_sna.ulisttot_pack(iatom_mod, idxu_flip, ielem, iatom_div) = { utot_re, utot_im };

        // No need to zero symmetrized ylist
      }
    }
  }
}

template<class DeviceType>
@@ -742,20 +777,20 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeYi,const int iato

// TagPairSNAPTransformYi functor body, called with an
// (iatom_mod, idxu_half, iatom_div) index triple; the atom index is rebuilt
// as iatom_mod + 32 * iatom_div.
// For each element index it reads ylist_pack_re / ylist_pack_im at the given
// index and stores them into ylist as a {re, im} pair.
// NOTE(review): this listing is a diff rendering -- each line that uses idxu
// is the earlier form of the idxu_half line that directly follows it.
template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPTransformYi,const int iatom_mod, const int idxu, const int iatom_div) const {
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPTransformYi,const int iatom_mod, const int idxu_half, const int iatom_div) const {
  SNAKokkos<DeviceType> my_sna = snaKK;

  // Rebuild the atom index and ignore indices past the end of the chunk.
  const int iatom = iatom_mod + iatom_div * 32;
  if (iatom >= chunk_size) return;

  // Range guard on the second index.
  if (idxu >= my_sna.idxu_max) return;
  if (idxu_half >= my_sna.idxu_half_max) return;

  // One pass when chemflag is off, nelements passes otherwise.
  int elem_count = chemflag ? nelements : 1;
  for (int ielem = 0; ielem < elem_count; ielem++) {
    const auto y_re = my_sna.ylist_pack_re(iatom_mod, idxu, ielem, iatom_div);
    const auto y_im = my_sna.ylist_pack_im(iatom_mod, idxu, ielem, iatom_div);
    const auto y_re = my_sna.ylist_pack_re(iatom_mod, idxu_half, ielem, iatom_div);
    const auto y_im = my_sna.ylist_pack_im(iatom_mod, idxu_half, ielem, iatom_div);

    // Recombine the separately stored real and imaginary parts into one entry.
    my_sna.ylist(idxu, ielem, iatom) = { y_re, y_im };
    my_sna.ylist(idxu_half, ielem, iatom) = { y_re, y_im };
  }

}
@@ -904,22 +939,52 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeUiCPU,const typen

// TagPairSNAPTransformUiCPU functor body, called with (j, iatom).
// For each element index and each (mb, ma) with 2*mb <= j and ma <= j it
//   - loads ulisttot at idxu_half,
//   - stores it into ulisttot_full at idxu,
//   - sets ylist at idxu_half to {0, 0},
//   - stores a sign-adjusted copy into ulisttot_full at the mirrored index idxu_flip.
// NOTE(review): this listing is a diff rendering. The TagPairSNAPZeroYiCPU
// signature and the statements that use team / idx / ii / zero_yi_cpu appear
// to belong to the removed function that this one replaces -- confirm against
// the real source file.
template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPZeroYiCPU,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPZeroYiCPU>::member_type& team) const {
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPTransformUiCPU, const int j, const int iatom) const {
  SNAKokkos<DeviceType> my_sna = snaKK;

  // Extract the quantum number
  const int idx = team.team_rank() + team.team_size() * (team.league_rank() % ((my_sna.idxu_max+team.team_size()-1)/team.team_size()));
  if (idx >= my_sna.idxu_max) return;
  // Ignore atom indices past the end of the current chunk.
  if (iatom >= chunk_size) return;

  // Extract the atomic index
  const int ii = team.league_rank() / ((my_sna.idxu_max+team.team_size()-1)/team.team_size());
  if (ii >= chunk_size) return;
  // Range guard on j.
  if (j > twojmax) return; 

  if (chemflag)
    for(int ielem = 0; ielem < nelements; ielem++)
      my_sna.zero_yi_cpu(idx,ii,ielem);
  else
    my_sna.zero_yi_cpu(idx,ii,0);
  // One pass when chemflag is off, nelements passes otherwise.
  int elem_count = chemflag ? nelements : 1;

  // De-symmetrize ulisttot
  for (int ielem = 0; ielem < elem_count; ielem++) {

    // Base offsets for this j; idxu_shift is added to each below to form
    // idxu_half and idxu respectively.
    const int jju_half = my_sna.idxu_half_block(j);
    const int jju = my_sna.idxu_block(j);

    for (int mb = 0; 2*mb <= j; mb++) {
      for (int ma = 0; ma <= j; ma++) {
        // Extract top half

        // Row-major position of (mb, ma) inside a block that is (j + 1) wide.
        const int idxu_shift = mb * (j + 1) + ma;
        const int idxu_half = jju_half + idxu_shift;
        const int idxu = jju + idxu_shift;

        // Load ulist
        auto utot = my_sna.ulisttot(idxu_half, ielem, iatom);

        // Store
        my_sna.ulisttot_full(idxu, ielem, iatom) = utot;

        // Zero Yi
        my_sna.ylist(idxu_half, ielem, iatom) = {0., 0.};

        // Symmetric term
        // sign_factor is +1 when (ma + mb) is even and -1 when it is odd.
        const int sign_factor = (((ma+mb)%2==0)?1:-1);
        // Algebraically jju + (j - mb) * (j + 1) + (j - ma), i.e. the
        // (j - mb, j - ma) entry of the same (j + 1)-wide block.
        const int idxu_flip = jju + (j + 1 - mb) * (j + 1) - (ma + 1);

        // +1: negate the imaginary part; -1: negate the real part.
        if (sign_factor == 1) {
          utot.im = -utot.im;
        } else {
          utot.re = -utot.re;
        }

        my_sna.ulisttot_full(idxu_flip, ielem, iatom) = utot;
      }
    }
  }
}

template<class DeviceType>
Loading