Commit f6a107c4 authored by Stan Moore
Browse files

Fix team_size issues in pair_snap_kokkos

parent 01a51d65
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -223,7 +223,7 @@ void FixQEqReaxKokkos<DeviceType>::pre_force(int vflag)

  // compute_H

  if (lmp->kokkos->ngpus == 0) { // CPU
  if (execution_space == Host) { // CPU
    if (neighflag == FULL) {
      FixQEqReaxKokkosComputeHFunctor<DeviceType, FULL> computeH_functor(this);
      Kokkos::parallel_scan(inum,computeH_functor);
+7 −0
Original line number Diff line number Diff line
@@ -64,6 +64,12 @@ public:
  void compute(int, int);
  double memory_usage();

  // Adjust the team size for a Kokkos::parallel_for launch of the kernel
  // tagged with TagStyle. Arguments, in order: league size used to build the
  // probe TeamPolicy, requested team size (in/out, overwritten with the
  // adjusted value), and vector length.
  template<class TagStyle>
  void check_team_size_for(int, int&, int);

  // Adjust the team size for a Kokkos::parallel_reduce launch of the kernel
  // tagged with TagStyle. Arguments, in order: league size used to build the
  // probe TeamPolicy, requested team size (in/out, overwritten with the
  // adjusted value), and vector length.
  template<class TagStyle>
  void check_team_size_reduce(int, int&, int);

  template<int NEIGHFLAG, int EVFLAG>
  KOKKOS_INLINE_FUNCTION
  void operator() (TagPairSNAPComputeForce<NEIGHFLAG,EVFLAG>,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeForce<NEIGHFLAG,EVFLAG> >::member_type& team) const;
@@ -131,6 +137,7 @@ protected:
  SNAKokkos<DeviceType> snaKK;

  int inum,max_neighs,chunk_size,chunk_offset;
  int host_flag;

  int eflag,vflag;

+156 −115
Original line number Diff line number Diff line
@@ -61,6 +61,8 @@ PairSNAPKokkos<DeviceType>::PairSNAPKokkos(LAMMPS *lmp) : PairSNAP(lmp)
  k_cutsq = tdual_fparams("PairSNAPKokkos::cutsq",atom->ntypes+1,atom->ntypes+1);
  auto d_cutsq = k_cutsq.template view<DeviceType>();
  rnd_cutsq = d_cutsq;

  host_flag = (execution_space == Host);
}

/* ---------------------------------------------------------------------- */
@@ -187,13 +189,10 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
  int chunk_size = MIN(2000,inum);
  chunk_offset = 0;

  int vector_length = 1;
  int team_size = 1;
  int team_size_max = Kokkos::TeamPolicy<DeviceType>(chunk_size,Kokkos::AUTO).team_size_max(*this,Kokkos::ParallelForTag());
  int vector_length_default = 1;
  int team_size_default = 1;
#ifdef KOKKOS_ENABLE_CUDA
  team_size = 32;//max_neighs;
  if (team_size*vector_length > team_size_max)
    team_size = team_size_max/vector_length;
  team_size_default = 32;//max_neighs;
#endif

  if (beta_max < inum) {
@@ -219,41 +218,40 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
      chunk_size = inum - chunk_offset;

    //ComputeNeigh
    {
      int vector_length = vector_length_default;
      int team_size = team_size_default;
      check_team_size_for<TagPairSNAPComputeNeigh>(chunk_size,team_size,vector_length);
      typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeNeigh> policy_neigh(chunk_size,team_size,vector_length);
      Kokkos::parallel_for("ComputeNeigh",policy_neigh,*this);
    }

    //PreUi
    {
      int vector_length = 1;
      int team_size = 1;
      if (lmp->kokkos->ngpus != 0) {
      int vector_length = vector_length_default;
      int team_size = team_size_default;
      if (!host_flag)
        vector_length = 32;
        team_size = 32;//max_neighs;
        int team_size_max = Kokkos::TeamPolicy<DeviceType, TagPairSNAPPreUi>(chunk_size,Kokkos::AUTO).team_size_max(*this,Kokkos::ParallelForTag());
        if (team_size*vector_length > team_size_max)
          team_size = team_size_max/vector_length;
      }
      check_team_size_for<TagPairSNAPPreUi>(chunk_size,team_size,vector_length);
      typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPPreUi> policy_preui((chunk_size+team_size-1)/team_size,team_size,vector_length);
      Kokkos::parallel_for("PreUi",policy_preui,*this);
    }

    // ComputeUI
    if (lmp->kokkos->ngpus == 0) { // CPU
    {
      int vector_length = vector_length_default;
      int team_size = team_size_default;
      if (host_flag) { // CPU
        // Run a fused calculation of ulist and accumulation into ulisttot using atomics
      int vector_length = 1;
      int team_size = 1;

        typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeUiCPU> policy_ui_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length);

        Kokkos::parallel_for("ComputeUiCPU",policy_ui_cpu,*this);
      } else { // GPU, vector parallelism, shared memory, separate ulist and ulisttot to avoid atomics

      // ComputeUi
      int vector_length = 32;
      int team_size = 4; // need to cap b/c of shared memory reqs
      int team_size_max = Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUi>(chunk_size,Kokkos::AUTO).team_size_max(*this,Kokkos::ParallelForTag());
      if (team_size*vector_length > team_size_max)
        team_size = team_size_max/vector_length;
        vector_length = 32;
        team_size = 4; // need to cap b/c of shared memory reqs
        check_team_size_for<TagPairSNAPComputeUi>(chunk_size,team_size,vector_length);

        // scratch size: 2 * team_size * (twojmax+1)^2, to cover all `m1`,`m2` values
        //   2 is for double buffer
@@ -271,6 +269,7 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
        Kokkos::parallel_for("ComputeUi",policy_ui,*this);

      }
    }


    //Compute bispectrum
@@ -281,27 +280,29 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
      Kokkos::parallel_for("ComputeZi",policy_zi,*this);

      //ComputeBi
      int vector_length = vector_length_default;
      int team_size = team_size_default;
      check_team_size_for<TagPairSNAPComputeBi>(chunk_size,team_size,vector_length);      
      typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeBi> policy_bi(chunk_size,team_size,vector_length);
      Kokkos::parallel_for("ComputeBi",policy_bi,*this);

    }

    //Compute beta = dE_i/dB_i for all i in list
    {
      int vector_length = vector_length_default;
      int team_size = team_size_default;
      check_team_size_for<TagPairSNAPBeta>(chunk_size,team_size,vector_length);
      typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPBeta> policy_beta(chunk_size,team_size,vector_length);
      Kokkos::parallel_for("ComputeBeta",policy_beta,*this);
    }

    //ZeroYi
    {
      int vector_length = 1;
      int team_size = 1;
      int team_size_max = Kokkos::TeamPolicy<DeviceType, TagPairSNAPZeroYi>(chunk_size,Kokkos::AUTO).team_size_max(*this,Kokkos::ParallelForTag());

#ifdef KOKKOS_ENABLE_CUDA
      int vector_length = vector_length_default;
      int team_size = team_size_default;
      if (!host_flag)
        team_size = 128;
      if (team_size*vector_length > team_size_max)
        team_size = team_size_max/vector_length;
#endif

      check_team_size_for<TagPairSNAPZeroYi>(chunk_size,team_size,vector_length);
      typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPZeroYi> policy_zero_yi(((idxu_max+team_size-1)/team_size)*chunk_size,team_size,vector_length);
      Kokkos::parallel_for("ZeroYi",policy_zero_yi,*this);
    }
@@ -312,9 +313,10 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
    Kokkos::parallel_for("ComputeYi",policy_yi,*this);

    //ComputeDuidrj and Deidrj
    if (lmp->kokkos->ngpus == 0) { // CPU
      int vector_length = 1;
      int team_size = 1;
    {
      int team_size = team_size_default;
      int vector_length = vector_length_default;
      if (host_flag) { // CPU

        typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeDuidrjCPU> policy_duidrj_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length);
        snaKK.set_dir(-1); // technically doesn't do anything
@@ -325,11 +327,9 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
        Kokkos::parallel_for("ComputeDeidrjCPU",policy_deidrj_cpu,*this);
      } else { // GPU, utilize scratch memory and splitting over dimensions, fused dui and dei

      int team_size_max = Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeFusedDeidrj>(chunk_size,Kokkos::AUTO).team_size_max(*this,Kokkos::ParallelForTag());
      int vector_length = 32;
      int team_size = 2; // need to cap b/c of shared memory reqs
      if (team_size*vector_length > team_size_max)
        team_size = team_size_max/vector_length;
        vector_length = 32;
        team_size = 2; // need to cap b/c of shared memory reqs
        check_team_size_for<TagPairSNAPComputeFusedDeidrj>(chunk_size,team_size,vector_length);

        // scratch size: 2 * 2 * team_size * (twojmax+1)*(twojmax/2+1), to cover half `m1`,`m2` values due to symmetry
        // 2 is for double buffer
@@ -349,29 +349,38 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
          Kokkos::parallel_for("ComputeFusedDeidrj",policy_fused_deidrj,*this);
        }
      }
    }  

    //ComputeForce
    {
      int team_size = team_size_default;
      int vector_length = vector_length_default;
      if (eflag) {
        if (neighflag == HALF) {
          check_team_size_reduce<TagPairSNAPComputeForce<HALF,1> >(chunk_size,team_size,vector_length);
          typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeForce<HALF,1> > policy_force(chunk_size,team_size,vector_length);
          Kokkos::parallel_reduce(policy_force
            ,*this,ev_tmp);
        } else if (neighflag == HALFTHREAD) {
          check_team_size_reduce<TagPairSNAPComputeForce<HALFTHREAD,1> >(chunk_size,team_size,vector_length);
          typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeForce<HALFTHREAD,1> > policy_force(chunk_size,team_size,vector_length);
          Kokkos::parallel_reduce(policy_force
            ,*this,ev_tmp);
        }
      } else {
        if (neighflag == HALF) {
          check_team_size_for<TagPairSNAPComputeForce<HALF,0> >(chunk_size,team_size,vector_length);
          typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeForce<HALF,0> > policy_force(chunk_size,team_size,vector_length);
          Kokkos::parallel_for(policy_force
            ,*this);
        } else if (neighflag == HALFTHREAD) {
          check_team_size_for<TagPairSNAPComputeForce<HALFTHREAD,0> >(chunk_size,team_size,vector_length);
          typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeForce<HALFTHREAD,0> > policy_force(chunk_size,team_size,vector_length);
          Kokkos::parallel_for(policy_force
            ,*this);
        }
      }
    }
    ev += ev_tmp;
    chunk_offset += chunk_size;

@@ -889,4 +898,36 @@ double PairSNAPKokkos<DeviceType>::memory_usage()
  return bytes;
}

/* ---------------------------------------------------------------------- */

// Cap the team size requested for a parallel_for launch of the kernel
// tagged with TagStyle.
//
//   inum          - league size used to build the probe TeamPolicy
//   team_size     - in/out: requested team size, overwritten with the
//                   adjusted value
//   vector_length - vector length the kernel will be launched with
//
// When KOKKOS_ENABLE_CUDA is defined, team_size is lowered to
// team_size_max/vector_length whenever team_size*vector_length exceeds the
// team_size_max value Kokkos reports for this functor and tag. In all other
// builds team_size is unconditionally set to 1.

template<class DeviceType>
template<class TagStyle>
void PairSNAPKokkos<DeviceType>::check_team_size_for(int inum, int &team_size, int vector_length) {
#ifdef KOKKOS_ENABLE_CUDA
  // query the limit only in the branch that actually consumes it
  const int team_size_max = Kokkos::TeamPolicy<DeviceType,TagStyle>(inum,Kokkos::AUTO).team_size_max(*this,Kokkos::ParallelForTag());

  if (team_size*vector_length > team_size_max)
    team_size = team_size_max/vector_length;
#else
  // the limit is never consulted here, so neither input is needed
  (void) inum;
  (void) vector_length;
  team_size = 1;
#endif
}

// Cap the team size requested for a parallel_reduce launch of the kernel
// tagged with TagStyle.
//
//   inum          - league size used to build the probe TeamPolicy
//   team_size     - in/out: requested team size, overwritten with the
//                   adjusted value
//   vector_length - vector length the kernel will be launched with
//
// When KOKKOS_ENABLE_CUDA is defined, team_size is lowered to
// team_size_max/vector_length whenever team_size*vector_length exceeds the
// team_size_max value Kokkos reports for this functor and tag (queried with
// ParallelReduceTag). In all other builds team_size is unconditionally
// set to 1.

template<class DeviceType>
template<class TagStyle>
void PairSNAPKokkos<DeviceType>::check_team_size_reduce(int inum, int &team_size, int vector_length) {
#ifdef KOKKOS_ENABLE_CUDA
  // query the limit only in the branch that actually consumes it
  const int team_size_max = Kokkos::TeamPolicy<DeviceType,TagStyle>(inum,Kokkos::AUTO).team_size_max(*this,Kokkos::ParallelReduceTag());

  if (team_size*vector_length > team_size_max)
    team_size = team_size_max/vector_length;
#else
  // the limit is never consulted here, so neither input is needed
  (void) inum;
  (void) vector_length;
  team_size = 1;
#endif
}

}