Commit b13be8f9 authored by charlie sievers's avatar charlie sievers
Browse files

Merge branch 'master' into lammps_gjf

parents 63c1ffd6 bd269ac5
Loading
Loading
Loading
Loading
+7 −3
Original line number Diff line number Diff line
@@ -39,6 +39,7 @@ struct TagPairSNAPPreUi{};
// Empty work-tag types. Where their use is visible in this file, a tag is
// passed as the tag template argument of a Kokkos execution policy in
// PairSNAPKokkos::compute() and as the first parameter of the matching
// PairSNAPKokkos::operator() overload, so the tag selects which kernel body
// a given Kokkos::parallel_for runs.
struct TagPairSNAPComputeUi{};
struct TagPairSNAPComputeZi{};
struct TagPairSNAPComputeBi{};
// Tag for the kernel whose operator() calls SNAKokkos::zero_yi(ii); compute()
// launches it over the index range [0, chunk_size).
struct TagPairSNAPZeroYi{};
struct TagPairSNAPComputeYi{};
struct TagPairSNAPComputeDuidrj{};
struct TagPairSNAPComputeDeidrj{};
@@ -73,19 +74,22 @@ public:
  void operator() (TagPairSNAPComputeNeigh,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeNeigh>::member_type& team) const;

  KOKKOS_INLINE_FUNCTION
  void operator() (TagPairSNAPPreUi,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPPreUi>::member_type& team) const;
  void operator() (TagPairSNAPPreUi,const int& ii) const;

  KOKKOS_INLINE_FUNCTION
  void operator() (TagPairSNAPComputeUi,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUi>::member_type& team) const;

  KOKKOS_INLINE_FUNCTION
  void operator() (TagPairSNAPComputeZi,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeZi>::member_type& team) const;
  void operator() (TagPairSNAPComputeZi,const int& ii) const;

  KOKKOS_INLINE_FUNCTION
  void operator() (TagPairSNAPComputeBi,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeBi>::member_type& team) const;

  KOKKOS_INLINE_FUNCTION
  void operator() (TagPairSNAPComputeYi,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeYi>::member_type& team) const;
  void operator() (TagPairSNAPZeroYi,const int& ii) const;

  KOKKOS_INLINE_FUNCTION
  void operator() (TagPairSNAPComputeYi,const int& ii) const;

  KOKKOS_INLINE_FUNCTION
  void operator() (TagPairSNAPComputeDuidrj,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDuidrj>::member_type& team) const;
+26 −23
Original line number Diff line number Diff line
@@ -184,22 +184,12 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
  Kokkos::parallel_reduce("PairSNAPKokkos::find_max_neighs",inum, FindMaxNumNeighs<DeviceType>(k_list), Kokkos::Experimental::Max<int>(max_neighs));

  int vector_length = 1;
  int ui_vector_length = 1;
  int team_size = 1;
  int yi_team_size = 1;
  int team_size_max = Kokkos::TeamPolicy<DeviceType>::team_size_max(*this);
#ifdef KOKKOS_ENABLE_CUDA
  team_size = 32;//max_neighs;
  if (team_size*vector_length > team_size_max)
    team_size = team_size_max/vector_length;

  yi_team_size = 256;
  if (yi_team_size*vector_length > team_size_max)
    yi_team_size = team_size_max/vector_length;

  ui_vector_length = 8;
  if (team_size*ui_vector_length > team_size_max)
    team_size = team_size_max/ui_vector_length;
#endif

  if (beta_max < inum) {
@@ -227,17 +217,21 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
    Kokkos::parallel_for("ComputeNeigh",policy_neigh,*this);

    //PreUi
    typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPPreUi> policy_preui(chunk_size,team_size,vector_length);
    typename Kokkos::RangePolicy<DeviceType, TagPairSNAPPreUi> policy_preui(0,chunk_size);
    Kokkos::parallel_for("PreUi",policy_preui,*this);

    //ComputeUi
    typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUi> policy_ui(((inum+team_size-1)/team_size)*max_neighs,team_size,ui_vector_length);
    typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUi> policy_ui(((inum+team_size-1)/team_size)*max_neighs,team_size,vector_length);
    Kokkos::parallel_for("ComputeUi",policy_ui,*this);

    //Ulisttot transpose
    snaKK.transpose_ulisttot();

    //Compute bispectrum
    if (quadraticflag || eflag) {
      //ComputeZi
      typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeZi> policy_zi(chunk_size,team_size,vector_length);
      int idxz_max = snaKK.idxz_max;
      typename Kokkos::RangePolicy<DeviceType, TagPairSNAPComputeZi> policy_zi(0,chunk_size*idxz_max);
      Kokkos::parallel_for("ComputeZi",policy_zi,*this);

      //ComputeBi
@@ -250,7 +244,12 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
    Kokkos::parallel_for("ComputeBeta",policy_beta,*this);

    //ComputeYi
    typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeYi> policy_yi(chunk_size,yi_team_size,vector_length);
    typename Kokkos::RangePolicy<DeviceType, TagPairSNAPZeroYi> policy_zero_yi(0,chunk_size);
    Kokkos::parallel_for("ZeroYi",policy_zero_yi,*this);

    //ComputeYi
    int idxz_max = snaKK.idxz_max;
    typename Kokkos::RangePolicy<DeviceType, TagPairSNAPComputeYi> policy_yi(0,chunk_size*idxz_max);
    Kokkos::parallel_for("ComputeYi",policy_yi,*this);

    //ComputeDuidrj
@@ -504,10 +503,9 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeNeigh,const typen

template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPPreUi,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPPreUi>::member_type& team) const {
  int ii = team.league_rank();
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPPreUi,const int& ii) const {
  SNAKokkos<DeviceType> my_sna = snaKK;
  my_sna.pre_ui(team,ii);
  my_sna.pre_ui(ii);
}

template<class DeviceType>
@@ -529,18 +527,23 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeUi,const typename

template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeYi,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeYi>::member_type& team) const {
  int ii = team.league_rank();
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPZeroYi,const int& ii) const {
  SNAKokkos<DeviceType> my_sna = snaKK;
  my_sna.compute_yi(team,ii,d_beta);
  my_sna.zero_yi(ii);
}

template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeZi,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeZi>::member_type& team) const {
  int ii = team.league_rank();
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeYi,const int& ii) const {
  SNAKokkos<DeviceType> my_sna = snaKK;
  my_sna.compute_yi(ii,d_beta);
}

template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeZi,const int& ii) const {
  SNAKokkos<DeviceType> my_sna = snaKK;
  my_sna.compute_zi(team,ii);
  my_sna.compute_zi(ii);
}

template<class DeviceType>
+57 −17
Original line number Diff line number Diff line
@@ -26,16 +26,29 @@
namespace LAMMPS_NS {

typedef double SNAreal;
typedef struct { SNAreal re, im; } SNAcomplex;

// Precomputed index record for one z-list entry (indexed by jjz).
// Records are filled on the host in build_indexlist() and read field by field
// in compute_zi() / compute_yi().
//   j1, j2, j       : the loop triple the entry was generated for
//   ma1min, ma2max  : start values derived from ma in build_indexlist()
//   mb1min, mb2max  : start values derived from mb in build_indexlist()
//   na              : trip count of the "ia" loop in compute_yi()
//   nb              : presumably the trip count of the "ib" loop -- confirm
//   jju             : idxu_block[j] + (j+1)*mb + ma, the y-list slot the
//                     entry accumulates into
struct SNAKK_ZINDICES {
  int j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, jju;
};
//typedef struct { SNAreal re, im; } SNAcomplex;
struct alignas(2*sizeof(SNAreal)) SNAcomplex{
  SNAreal re, im;

  KOKKOS_INLINE_FUNCTION
  SNAcomplex() : re(0),im(0)
  {}

struct SNAKK_BINDICES {
  int j1, j2, j;
  KOKKOS_INLINE_FUNCTION
  SNAcomplex(SNAreal real_in, SNAreal imag_in)
      :re(real_in),im(imag_in)
  {}
};

//struct SNAKK_ZINDICES {
//  int j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, jju;
//};
//
//struct SNAKK_BINDICES {
//  int j1, j2, j;
//};

template<class DeviceType>
class SNAKokkos {

@@ -53,12 +66,32 @@ public:
  typedef Kokkos::View<SNAcomplex*, DeviceType> t_sna_1c;
  typedef Kokkos::View<SNAcomplex*, DeviceType, Kokkos::MemoryTraits<Kokkos::Atomic> > t_sna_1c_atomic;
  typedef Kokkos::View<SNAcomplex**, DeviceType> t_sna_2c;
  typedef Kokkos::View<SNAcomplex**, Kokkos::LayoutRight, DeviceType> t_sna_2c_cpu;
  typedef Kokkos::View<SNAcomplex**, Kokkos::LayoutRight, DeviceType> t_sna_2c_lr;
  typedef Kokkos::View<SNAcomplex***, DeviceType> t_sna_3c;
  typedef Kokkos::View<SNAcomplex***[3], DeviceType> t_sna_4c;
  typedef Kokkos::View<SNAcomplex**[3], DeviceType> t_sna_3c3;
  typedef Kokkos::View<SNAcomplex*****, DeviceType> t_sna_5c;

// Helper class to get ulisttot_lr

// Generic case of the helper used by transpose_ulisttot(): selected for every
// DeviceLayout that is not Kokkos::LayoutRight (that layout has its own
// partial specialization).
//   T1 : type of the destination view (ulisttot_lr at the call site)
//   T2 : type of the source view      (ulisttot at the call site)
template<typename DeviceLayout, typename T1, typename T2>
class UlisttotHelper {
public:
  // Copy the contents of `src` into `dst_lr` with Kokkos::deep_copy.
  // NOTE(review): dst_lr is expected to be allocated already; grow_rij()
  // allocates ulisttot_lr when the device layout is not LayoutRight.
  static inline void transpose(T1 &dst_lr, const T2 &src) {
    Kokkos::deep_copy(dst_lr, src);
  }
};

// Partial specialization chosen when the layout argument is
// Kokkos::LayoutRight. No element-wise copy is performed here.
//   T1 : type of the destination view (ulisttot_lr at the call site)
//   T2 : type of the source view      (ulisttot at the call site)
template<typename T1, typename T2>
class UlisttotHelper<Kokkos::LayoutRight,T1,T2> {
public:
  // Plain assignment of `src` to `dst_lr` instead of a deep copy.
  // NOTE(review): for Kokkos views this presumably makes dst_lr refer to the
  // same allocation as src -- grow_rij() allocates no separate ulisttot_lr
  // buffer for this layout; confirm.
  static inline void transpose(T1 &dst_lr, const T2 &src) {
    dst_lr = src;
  }
};

inline
  SNAKokkos() {};
  KOKKOS_INLINE_FUNCTION
@@ -80,17 +113,22 @@ inline

  int ncoeff;

inline
  void transpose_ulisttot();

  // functions for bispectrum coefficients
  KOKKOS_INLINE_FUNCTION
  void pre_ui(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int); // ForceSNAP
  void pre_ui(const int&); // ForceSNAP
  KOKKOS_INLINE_FUNCTION
  void compute_ui(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int); // ForceSNAP
  KOKKOS_INLINE_FUNCTION
  void compute_ui_orig(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int); // ForceSNAP
  KOKKOS_INLINE_FUNCTION
  void compute_zi(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int);    // ForceSNAP
  void compute_zi(const int&);    // ForceSNAP
  KOKKOS_INLINE_FUNCTION
  void zero_yi(const int&);
  KOKKOS_INLINE_FUNCTION
  void compute_yi(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int,
  void compute_yi(int,
   const Kokkos::View<F_FLOAT**, DeviceType> &beta); // ForceSNAP
  KOKKOS_INLINE_FUNCTION
  void compute_bi(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int);    // ForceSNAP
@@ -129,23 +167,25 @@ inline
  int twojmax, diagonalstyle;
  
  t_sna_2d blist;
  t_sna_2c_cpu ulisttot;
  t_sna_2c ulisttot;
  t_sna_2c_lr ulisttot_lr;
  t_sna_2c zlist;

  t_sna_3c ulist;
  t_sna_2c ylist;
  t_sna_2c_lr ylist;

  // derivatives of data
  t_sna_4c dulist;

  int idxcg_max, idxu_max, idxz_max, idxb_max;

private:
  double rmin0, rfac0;

  //use indexlist instead of loops, constructor generates these
  // Same across all SNAKokkos
  Kokkos::View<SNAKK_ZINDICES*, DeviceType> idxz;
  Kokkos::View<SNAKK_BINDICES*, DeviceType> idxb;
  int idxcg_max, idxu_max, idxz_max, idxb_max;
  Kokkos::View<int*[10], DeviceType> idxz;
  Kokkos::View<int*[3], DeviceType> idxb;
  Kokkos::View<int***, DeviceType> idxcg_block;
  Kokkos::View<int*, DeviceType> idxu_block;
  Kokkos::View<int***, DeviceType> idxz_block;
@@ -173,9 +213,9 @@ inline
inline
  void init_rootpqarray();    // init()
  KOKKOS_INLINE_FUNCTION
  void zero_uarraytot(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int);      // compute_ui
  void zero_uarraytot(const int&);      // compute_ui
  KOKKOS_INLINE_FUNCTION
  void addself_uarraytot(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, double); // compute_ui
  void addself_uarraytot(const int&, const double&); // compute_ui
  KOKKOS_INLINE_FUNCTION
  void add_uarraytot(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int, double, double, double); // compute_ui

+171 −212
Original line number Diff line number Diff line
@@ -117,7 +117,7 @@ void SNAKokkos<DeviceType>::build_indexlist()
        if (j >= j1) idxb_count++;
  
  idxb_max = idxb_count;
  idxb = Kokkos::View<SNAKK_BINDICES*, DeviceType>("SNAKokkos::idxb",idxb_max);
  idxb = Kokkos::View<int*[3], DeviceType>("SNAKokkos::idxb",idxb_max);
  auto h_idxb = Kokkos::create_mirror_view(idxb);
  
  idxb_count = 0;
@@ -125,9 +125,9 @@ void SNAKokkos<DeviceType>::build_indexlist()
    for(int j2 = 0; j2 <= j1; j2++)
      for(int j = j1 - j2; j <= MIN(twojmax, j1 + j2); j += 2)
        if (j >= j1) {
          h_idxb[idxb_count].j1 = j1;
          h_idxb[idxb_count].j2 = j2;
          h_idxb[idxb_count].j = j;
          h_idxb(idxb_count,0) = j1;
          h_idxb(idxb_count,1) = j2;
          h_idxb(idxb_count,2) = j;
          idxb_count++;
        }
  Kokkos::deep_copy(idxb,h_idxb);
@@ -160,7 +160,7 @@ void SNAKokkos<DeviceType>::build_indexlist()
            idxz_count++;
  
  idxz_max = idxz_count;
  idxz = Kokkos::View<SNAKK_ZINDICES*, DeviceType>("SNAKokkos::idxz",idxz_max);
  idxz = Kokkos::View<int*[10], DeviceType>("SNAKokkos::idxz",idxz_max);
  auto h_idxz = Kokkos::create_mirror_view(idxz);

  idxz_block = Kokkos::View<int***, DeviceType>("SNAKokkos::idxz_block", jdim,jdim,jdim);
@@ -178,20 +178,20 @@ void SNAKokkos<DeviceType>::build_indexlist()

        for (int mb = 0; 2*mb <= j; mb++)
          for (int ma = 0; ma <= j; ma++) {
            h_idxz[idxz_count].j1 = j1;
            h_idxz[idxz_count].j2 = j2;
            h_idxz[idxz_count].j = j;
            h_idxz[idxz_count].ma1min = MAX(0, (2 * ma - j - j2 + j1) / 2);
            h_idxz[idxz_count].ma2max = (2 * ma - j - (2 * h_idxz[idxz_count].ma1min - j1) + j2) / 2;
            h_idxz[idxz_count].na = MIN(j1, (2 * ma - j + j2 + j1) / 2) - h_idxz[idxz_count].ma1min + 1;
            h_idxz[idxz_count].mb1min = MAX(0, (2 * mb - j - j2 + j1) / 2);
            h_idxz[idxz_count].mb2max = (2 * mb - j - (2 * h_idxz[idxz_count].mb1min - j1) + j2) / 2;
            h_idxz[idxz_count].nb = MIN(j1, (2 * mb - j + j2 + j1) / 2) - h_idxz[idxz_count].mb1min + 1;
            h_idxz(idxz_count,0) = j1;
            h_idxz(idxz_count,1) = j2;
            h_idxz(idxz_count,2) = j;
            h_idxz(idxz_count,3) = MAX(0, (2 * ma - j - j2 + j1) / 2);
            h_idxz(idxz_count,4) = (2 * ma - j - (2 * h_idxz(idxz_count,3) - j1) + j2) / 2;
            h_idxz(idxz_count,5) = MAX(0, (2 * mb - j - j2 + j1) / 2);
            h_idxz(idxz_count,6) = (2 * mb - j - (2 * h_idxz(idxz_count,5) - j1) + j2) / 2;
            h_idxz(idxz_count,7) = MIN(j1, (2 * ma - j + j2 + j1) / 2) - h_idxz(idxz_count,3) + 1;
            h_idxz(idxz_count,8) = MIN(j1, (2 * mb - j + j2 + j1) / 2) - h_idxz(idxz_count,5) + 1;

            // apply to z(j1,j2,j,ma,mb) to unique element of y(j)

            const int jju = h_idxu_block[j] + (j+1)*mb + ma;
            h_idxz[idxz_count].jju = jju;
            h_idxz(idxz_count,9) = jju;

            idxz_count++;
          }
@@ -225,11 +225,13 @@ void SNAKokkos<DeviceType>::grow_rij(int newnatom, int newnmax)
  dedr = t_sna_3d("sna:dedr",natom,nmax,3);

  blist = t_sna_2d("sna:blist",natom,idxb_max);
  ulisttot = t_sna_2c_cpu("sna:ulisttot",natom,idxu_max);
  ulisttot = t_sna_2c("sna:ulisttot",natom,idxu_max);
  if (!Kokkos::Impl::is_same<typename DeviceType::array_layout,Kokkos::LayoutRight>::value) 
    ulisttot_lr = t_sna_2c_lr("sna:ulisttot_lr",natom,idxu_max);
  zlist = t_sna_2c("sna:zlist",natom,idxz_max);

  ulist = t_sna_3c("sna:ulist",natom,nmax,idxu_max);
  ylist = t_sna_2c("sna:ylist",natom,idxu_max);
  ylist = t_sna_2c_lr("sna:ylist",natom,idxu_max);

  dulist = t_sna_4c("sna:dulist",natom,nmax,idxu_max);
}
@@ -240,15 +242,15 @@ void SNAKokkos<DeviceType>::grow_rij(int newnatom, int newnmax)

template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void SNAKokkos<DeviceType>::pre_ui(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom)
void SNAKokkos<DeviceType>::pre_ui(const int& iatom)
{
  if(team.team_rank() == 0) {
    zero_uarraytot(team,iatom);
  //if(team.team_rank() == 0) {
    zero_uarraytot(iatom);
    //Kokkos::single(Kokkos::PerThread(team), [&] (){
    addself_uarraytot(team,iatom,wself);
    addself_uarraytot(iatom,wself);
    //});
  }
  team.team_barrier();
  //}
  //team.team_barrier();
}

/* ----------------------------------------------------------------------
@@ -278,50 +280,7 @@ void SNAKokkos<DeviceType>::compute_ui(const typename Kokkos::TeamPolicy<DeviceT
  z0 = r / tan(theta0);

  compute_uarray(team, iatom, jnbor, x, y, z, z0, r);
  //Kokkos::single(Kokkos::PerThread(team), [&] (){
  add_uarraytot(team, iatom, jnbor, r, wj(iatom,jnbor), rcutij(iatom,jnbor));
  //});
}

template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void SNAKokkos<DeviceType>::compute_ui_orig(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnum)
{
  double rsq, r, x, y, z, z0, theta0;

  // utot(j,ma,mb) = 0 for all j,ma,ma
  // utot(j,ma,ma) = 1 for all j,ma
  // for j in neighbors of i:
  //   compute r0 = (x,y,z,z0)
  //   utot(j,ma,mb) += u(r0;j,ma,mb) for all j,ma,mb

  if(team.team_rank() == 0) {
    zero_uarraytot(team,iatom);
    //Kokkos::single(Kokkos::PerThread(team), [&] (){
    addself_uarraytot(team,iatom,wself);
    //});
  }
  team.team_barrier();

  Kokkos::parallel_for(Kokkos::TeamThreadRange(team,jnum),
      [&] (const int& j) {
  //for(int j = 0; j < jnum; j++) {
    x = rij(iatom,j,0);
    y = rij(iatom,j,1);
    z = rij(iatom,j,2);
    rsq = x * x + y * y + z * z;
    r = sqrt(rsq);

    theta0 = (r - rmin0) * rfac0 * MY_PI / (rcutij(iatom,j) - rmin0);
    //    theta0 = (r - rmin0) * rscale0;
    z0 = r / tan(theta0);

    compute_uarray(team, iatom, j, x, y, z, z0, r);
    //Kokkos::single(Kokkos::PerThread(team), [&] (){
    add_uarraytot(team, iatom, j, r, wj(iatom,j), rcutij(iatom,j));
    //});
  });

}

/* ----------------------------------------------------------------------
@@ -330,20 +289,20 @@ void SNAKokkos<DeviceType>::compute_ui_orig(const typename Kokkos::TeamPolicy<De

template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void SNAKokkos<DeviceType>::compute_zi(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom)
void SNAKokkos<DeviceType>::compute_zi(const int& iter)
{
  Kokkos::parallel_for(Kokkos::TeamThreadRange(team,idxz_max),
      [&] (const int& jjz) {
  //for(int jjz = 0; jjz < idxz_max; jjz++) {
    const int j1 = idxz[jjz].j1;
    const int j2 = idxz[jjz].j2;
    const int j = idxz[jjz].j;
    const int ma1min = idxz[jjz].ma1min;
    const int ma2max = idxz[jjz].ma2max;
    const int na = idxz[jjz].na;
    const int mb1min = idxz[jjz].mb1min;
    const int mb2max = idxz[jjz].mb2max;
    const int nb = idxz[jjz].nb;
  const int iatom = iter / idxz_max;
  const int jjz = iter % idxz_max;

  const int j1 = idxz(jjz,0);
  const int j2 = idxz(jjz,1);
  const int j = idxz(jjz,2);
  const int ma1min = idxz(jjz,3);
  const int ma2max = idxz(jjz,4);
  const int mb1min = idxz(jjz,5);
  const int mb2max = idxz(jjz,6);
  const int na = idxz(jjz,7);
  const int nb = idxz(jjz,8);

  const double* cgblock = cglist.data() + idxcg_block(j1,j2,j);

@@ -376,8 +335,6 @@ void SNAKokkos<DeviceType>::compute_zi(const typename Kokkos::TeamPolicy<DeviceT
    jju2 -= j2+1;
    icgb += j2;
  } // end loop over ib

  }); // end loop over jjz
}

/* ----------------------------------------------------------------------
@@ -386,34 +343,35 @@ void SNAKokkos<DeviceType>::compute_zi(const typename Kokkos::TeamPolicy<DeviceT

template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void SNAKokkos<DeviceType>::compute_yi(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom,
 const Kokkos::View<F_FLOAT**, DeviceType> &beta)
{
  double betaj;
  const int ii = iatom;

void SNAKokkos<DeviceType>::zero_yi(const int& iatom)
{
    Kokkos::parallel_for(Kokkos::TeamThreadRange(team,ylist.extent(1)),
        [&] (const int& i) {
      ylist(iatom,i).re = 0.0;
      ylist(iatom,i).im = 0.0;
    });
    for (int j = 0; j < idxu_max; j++)
      ylist(iatom,j) = {0.0,0.0};
}

  //int flopsum = 0;
/* ----------------------------------------------------------------------
   compute Yi from Ui without storing Zi, looping over zlist indices
------------------------------------------------------------------------- */

  Kokkos::parallel_for(Kokkos::TeamThreadRange(team,idxz_max),
      [&] (const int& jjz) {
  //for(int jjz = 0; jjz < idxz_max; jjz++) {
    const int j1 = idxz[jjz].j1;
    const int j2 = idxz[jjz].j2;
    const int j = idxz[jjz].j;
    const int ma1min = idxz[jjz].ma1min;
    const int ma2max = idxz[jjz].ma2max;
    const int na = idxz[jjz].na;
    const int mb1min = idxz[jjz].mb1min;
    const int mb2max = idxz[jjz].mb2max;
    const int nb = idxz[jjz].nb;
template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void SNAKokkos<DeviceType>::compute_yi(int iter,
 const Kokkos::View<F_FLOAT**, DeviceType> &beta)
{
  double betaj;
  const int iatom = iter / idxz_max;
  const int jjz = iter % idxz_max;

  const int j1 = idxz(jjz,0);
  const int j2 = idxz(jjz,1);
  const int j = idxz(jjz,2);
  const int ma1min = idxz(jjz,3);
  const int ma2max = idxz(jjz,4);
  const int mb1min = idxz(jjz,5);
  const int mb2max = idxz(jjz,6);
  const int na = idxz(jjz,7);
  const int nb = idxz(jjz,8);
  const int jju = idxz(jjz,9);

  const double* cgblock = cglist.data() + idxcg_block(j1,j2,j);
  //int mb = (2 * (mb1min+mb2max) - j1 - j2 + j) / 2;
@@ -435,9 +393,8 @@ void SNAKokkos<DeviceType>::compute_yi(const typename Kokkos::TeamPolicy<DeviceT
    int icga = ma1min*(j2+1) + ma2max;

    for(int ia = 0; ia < na; ia++) {
        suma1_r += cgblock[icga] * (ulisttot(iatom,jju1+ma1).re * ulisttot(iatom,jju2+ma2).re - ulisttot(iatom,jju1+ma1).im * ulisttot(iatom,jju2+ma2).im);
        suma1_i += cgblock[icga] * (ulisttot(iatom,jju1+ma1).re * ulisttot(iatom,jju2+ma2).im + ulisttot(iatom,jju1+ma1).im * ulisttot(iatom,jju2+ma2).re);
        //flopsum += 10;
      suma1_r += cgblock[icga] * (ulisttot_lr(iatom,jju1+ma1).re * ulisttot_lr(iatom,jju2+ma2).re - ulisttot_lr(iatom,jju1+ma1).im * ulisttot_lr(iatom,jju2+ma2).im);
      suma1_i += cgblock[icga] * (ulisttot_lr(iatom,jju1+ma1).re * ulisttot_lr(iatom,jju2+ma2).im + ulisttot_lr(iatom,jju1+ma1).im * ulisttot_lr(iatom,jju2+ma2).re);
      ma1++;
      ma2--;
      icga += j2;
@@ -451,37 +408,29 @@ void SNAKokkos<DeviceType>::compute_yi(const typename Kokkos::TeamPolicy<DeviceT
  } // end loop over ib

  // apply to z(j1,j2,j,ma,mb) to unique element of y(j)
    // find right y_list[jju] and beta(ii,jjb) entries
  // find right y_list[jju] and beta(iatom,jjb) entries
  // multiply and divide by j+1 factors
  // account for multiplicity of 1, 2, or 3

    const int jju = idxz[jjz].jju;

  // pick out right beta value

  if (j >= j1) {
    const int jjb = idxb_block(j1,j2,j);
    if (j1 == j) {
        if (j2 == j) betaj = 3*beta(ii,jjb);
        else betaj = 2*beta(ii,jjb);
      } else betaj = beta(ii,jjb); 
      if (j2 == j) betaj = 3*beta(iatom,jjb);
      else betaj = 2*beta(iatom,jjb);
    } else betaj = beta(iatom,jjb); 
  } else if (j >= j2) {
    const int jjb = idxb_block(j,j2,j1);
      if (j2 == j) betaj = 2*beta(ii,jjb)*(j1+1)/(j+1.0);
      else betaj = beta(ii,jjb)*(j1+1)/(j+1.0);
    if (j2 == j) betaj = 2*beta(iatom,jjb)*(j1+1)/(j+1.0);
    else betaj = beta(iatom,jjb)*(j1+1)/(j+1.0);
  } else {
    const int jjb = idxb_block(j2,j,j1);
      betaj = beta(ii,jjb)*(j1+1)/(j+1.0);
    betaj = beta(iatom,jjb)*(j1+1)/(j+1.0);
  }

  Kokkos::single(Kokkos::PerThread(team), [&] () {
  Kokkos::atomic_add(&(ylist(iatom,jju).re), betaj*ztmp_r);
  Kokkos::atomic_add(&(ylist(iatom,jju).im), betaj*ztmp_i);
  });

  }); // end loop over jjz

  //printf("sum %i\n",flopsum);
}

/* ----------------------------------------------------------------------
@@ -556,9 +505,9 @@ void SNAKokkos<DeviceType>::compute_bi(const typename Kokkos::TeamPolicy<DeviceT
  Kokkos::parallel_for(Kokkos::TeamThreadRange(team,idxb_max),
      [&] (const int& jjb) {
  //for(int jjb = 0; jjb < idxb_max; jjb++) {
    const int j1 = idxb[jjb].j1;
    const int j2 = idxb[jjb].j2;
    const int j = idxb[jjb].j;
    const int j1 = idxb(jjb,0);
    const int j2 = idxb(jjb,1);
    const int j = idxb(jjb,2);

    int jjz = idxz_block(j1,j2,j);
    int jju = idxu_block[j];
@@ -648,14 +597,16 @@ void SNAKokkos<DeviceType>::compute_duidrj(const typename Kokkos::TeamPolicy<Dev

template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void SNAKokkos<DeviceType>::zero_uarraytot(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom)
void SNAKokkos<DeviceType>::zero_uarraytot(const int& iatom)
{
  {
    Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,ulisttot.extent(1)),
        [&] (const int& i) {
    //Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,ulisttot.extent(1)),
    //    [&] (const int& i) {
    for (int i = 0; i < ulisttot.extent(1); i++) {
      ulisttot(iatom,i).re = 0.0;
      ulisttot(iatom,i).im = 0.0;
    });
    }
    //});
  }
}

@@ -663,18 +614,18 @@ void SNAKokkos<DeviceType>::zero_uarraytot(const typename Kokkos::TeamPolicy<Dev

template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void SNAKokkos<DeviceType>::addself_uarraytot(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, double wself_in)
void SNAKokkos<DeviceType>::addself_uarraytot(const int& iatom, const double& wself_in)
{
  Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,twojmax+1),
    [&] (const int& j) {
  //for (int j = 0; j <= twojmax; j++)
  //Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,twojmax+1),
  //  [&] (const int& j) {
  for (int j = 0; j <= twojmax; j++) {
    int jju = idxu_block[j];
    for (int ma = 0; ma <= j; ma++) {
      ulisttot(iatom,jju).re = wself_in;
      ulisttot(iatom,jju).im = 0.0;
      jju += j+2;
    }
  });
  }//});
}

/* ----------------------------------------------------------------------
@@ -786,6 +737,12 @@ void SNAKokkos<DeviceType>::compute_uarray(const typename Kokkos::TeamPolicy<Dev
  }
}

// Bring ulisttot_lr up to date from ulisttot. The work is delegated to
// UlisttotHelper, which is selected by the device's array layout: the generic
// helper deep-copies, the LayoutRight specialization just assigns the view.
template<class DeviceType>
void SNAKokkos<DeviceType>::transpose_ulisttot()
{
  using device_layout_t = typename DeviceType::array_layout;
  using dst_view_t = decltype(ulisttot_lr);
  using src_view_t = decltype(ulisttot);
  using helper_t = UlisttotHelper<device_layout_t,dst_view_t,src_view_t>;

  helper_t::transpose(ulisttot_lr,ulisttot);
}

/* ----------------------------------------------------------------------
   compute derivatives of Wigner U-functions for one neighbor
   see comments in compute_uarray()
@@ -1318,6 +1275,8 @@ double SNAKokkos<DeviceType>::memory_usage()

  bytes += natom * idxu_max * sizeof(double) * 2;        // ulist
  bytes += natom * idxu_max * sizeof(double) * 2;        // ulisttot
  if (!Kokkos::Impl::is_same<typename DeviceType::array_layout,Kokkos::LayoutRight>::value)
    bytes += natom * idxu_max * sizeof(double) * 2;        // ulisttot_lr
  bytes += natom * idxu_max * 3 * sizeof(double) * 2;    // dulist
                                                       
  bytes += natom * idxz_max * sizeof(double) * 2;        // zlist
@@ -1329,8 +1288,8 @@ double SNAKokkos<DeviceType>::memory_usage()
  bytes += jdim * jdim * jdim * sizeof(int);             // idxz_block
  bytes += jdim * jdim * jdim * sizeof(int);             // idxb_block

  bytes += idxz_max * sizeof(SNAKK_ZINDICES);            // idxz
  bytes += idxb_max * sizeof(SNAKK_BINDICES);            // idxb
  bytes += idxz_max * 10 * sizeof(int);                  // idxz
  bytes += idxb_max * 3 * sizeof(int);                   // idxb

  bytes += jdim * sizeof(double);                        // bzero