Unverified Commit 95c3d2fc authored by Stan Moore's avatar Stan Moore Committed by GitHub
Browse files

Merge pull request #1051 from stanmoore1/data_dup

Add data duplication option to the KOKKOS package
parents 510e09f4 165fa01a
Loading
Loading
Loading
Loading
+21 −0
Original line number Diff line number Diff line
@@ -106,6 +106,11 @@ modification to the input script is needed. Alternatively, one can run
with the KOKKOS package by editing the input script as described
below.

NOTE: When using a single OpenMP thread, the Kokkos Serial backend (i.e. 
Makefile.kokkos_mpi_only) will give better performance than the OpenMP 
backend (i.e. Makefile.kokkos_omp) because some of the overhead to make 
the code thread-safe is removed. 

NOTE: The default for the "package kokkos"_package.html command is to
use "full" neighbor lists and set the Newton flag to "off" for both
pairwise and bonded interactions. However, when running on CPUs, it
@@ -122,6 +127,22 @@ mpirun -np 16 lmp_kokkos_mpi_only -k on -sf kk -pk kokkos newton on neigh half c
If the "newton"_newton.html command is used in the input
script, it can also override the Newton flag defaults.

For half neighbor lists and OpenMP, the KOKKOS package uses data 
duplication (i.e. thread-private arrays) by default to avoid 
thread-level write conflicts in the force arrays (and other data 
structures as necessary). Data duplication is typically fastest for 
small numbers of threads (i.e. 8 or less) but does increase memory 
footprint and is not scalable to large numbers of threads. An 
alternative to data duplication is to use thread-level atomics, which 
don't require duplication. The use of atomics can be forced by compiling 
with the "-DLMP_KOKKOS_USE_ATOMICS" compile switch. Most but not all 
Kokkos-enabled pair_styles support data duplication. Alternatively, full 
neighbor lists avoid the need for duplication or atomics but require 
more compute operations per atom. When using the Kokkos Serial backend 
or the OpenMP backend with a single thread, no duplication or atomics are 
used. For CUDA and half neighbor lists, the KOKKOS package always uses 
atomics.

[Core and Thread Affinity:]

When using multi-threading, it is important for performance to bind
+38 −11
Original line number Diff line number Diff line
@@ -247,6 +247,13 @@ void FixQEqReaxKokkos<DeviceType>::pre_force(int vflag)
  k_t.template modify<LMPHostType>();
  k_t.template sync<DeviceType>();

  need_dup = lmp->kokkos->need_dup<DeviceType>();

  if (need_dup)
    dup_o = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterDuplicated> (d_o); // allocate duplicated memory
  else
    ndup_o = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterNonDuplicated> (d_o);

  // 1st cg solve over b_s, s
  cg_solve1();

@@ -262,6 +269,10 @@ void FixQEqReaxKokkos<DeviceType>::pre_force(int vflag)

  if (!allocated_flag)
    allocated_flag = 1;

  // free duplicated memory
  if (need_dup)
    dup_o = decltype(dup_o)();
}

/* ---------------------------------------------------------------------- */
@@ -480,10 +491,12 @@ void FixQEqReaxKokkos<DeviceType>::cg_solve1()
    if (neighflag == HALF) {
      FixQEqReaxKokkosSparse13Functor<DeviceType,HALF> sparse13_functor(this);
      Kokkos::parallel_for(inum,sparse13_functor);
    } else {
    } else if (neighflag == HALFTHREAD) {
      FixQEqReaxKokkosSparse13Functor<DeviceType,HALFTHREAD> sparse13_functor(this);
      Kokkos::parallel_for(inum,sparse13_functor);
    }
    if (need_dup)
      Kokkos::Experimental::contribute(d_o, dup_o);
  } else {
    Kokkos::parallel_for(Kokkos::TeamPolicy <DeviceType, TagSparseMatvec1> (inum, teamsize), *this);
  }
@@ -531,18 +544,21 @@ void FixQEqReaxKokkos<DeviceType>::cg_solve1()
    Kokkos::parallel_for(inum,sparse22_functor);
    if (neighflag != FULL) {
      Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType,TagZeroQGhosts>(nlocal,nlocal+atom->nghost),*this);
      if (need_dup)
        dup_o.reset_except(d_o);
      if (neighflag == HALF) {
        FixQEqReaxKokkosSparse23Functor<DeviceType,HALF> sparse23_functor(this);
        Kokkos::parallel_for(inum,sparse23_functor);
      } else {
      } else if (neighflag == HALFTHREAD) {
        FixQEqReaxKokkosSparse23Functor<DeviceType,HALFTHREAD> sparse23_functor(this);
        Kokkos::parallel_for(inum,sparse23_functor);
      }
      if (need_dup)
        Kokkos::Experimental::contribute(d_o, dup_o);
    } else {
      Kokkos::parallel_for(Kokkos::TeamPolicy <DeviceType, TagSparseMatvec2> (inum, teamsize), *this);
    }


    if (neighflag != FULL) {
      k_o.template modify<DeviceType>();
      k_o.template sync<LMPHostType>();
@@ -607,13 +623,17 @@ void FixQEqReaxKokkos<DeviceType>::cg_solve2()
  Kokkos::parallel_for(inum,sparse32_functor);
  if (neighflag != FULL) {
    Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType,TagZeroQGhosts>(nlocal,nlocal+atom->nghost),*this);
    if (need_dup)
      dup_o.reset_except(d_o);
    if (neighflag == HALF) {
      FixQEqReaxKokkosSparse33Functor<DeviceType,HALF> sparse33_functor(this);
      Kokkos::parallel_for(inum,sparse33_functor);
    } else {
    } else if (neighflag == HALFTHREAD) {
      FixQEqReaxKokkosSparse33Functor<DeviceType,HALFTHREAD> sparse33_functor(this);
      Kokkos::parallel_for(inum,sparse33_functor);
    }
    if (need_dup)
      Kokkos::Experimental::contribute(d_o, dup_o);
  } else {
    Kokkos::parallel_for(Kokkos::TeamPolicy <DeviceType, TagSparseMatvec3> (inum, teamsize), *this);
  }
@@ -661,13 +681,17 @@ void FixQEqReaxKokkos<DeviceType>::cg_solve2()
    Kokkos::parallel_for(inum,sparse22_functor);
    if (neighflag != FULL) {
      Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType,TagZeroQGhosts>(nlocal,nlocal+atom->nghost),*this);
      if (need_dup)
        dup_o.reset_except(d_o);
      if (neighflag == HALF) {
        FixQEqReaxKokkosSparse23Functor<DeviceType,HALF> sparse23_functor(this);
        Kokkos::parallel_for(inum,sparse23_functor);
      } else {
      } else if (neighflag == HALFTHREAD) {
        FixQEqReaxKokkosSparse23Functor<DeviceType,HALFTHREAD> sparse23_functor(this);
        Kokkos::parallel_for(inum,sparse23_functor);
      }
      if (need_dup)
        Kokkos::Experimental::contribute(d_o, dup_o);
    } else {
      Kokkos::parallel_for(Kokkos::TeamPolicy <DeviceType, TagSparseMatvec2> (inum, teamsize), *this);
    }
@@ -779,8 +803,9 @@ template<int NEIGHFLAG>
KOKKOS_INLINE_FUNCTION
void FixQEqReaxKokkos<DeviceType>::sparse13_item(int ii) const
{
  // The q array is atomic for Half/Thread neighbor style
  Kokkos::View<F_FLOAT*, typename DAT::t_float_1d::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_o = d_o;
  // The q array is duplicated for OpenMP, atomic for CUDA, and neither for Serial
  auto v_o = ScatterViewHelper<NeedDup<NEIGHFLAG,DeviceType>::value,decltype(dup_o),decltype(ndup_o)>::get(dup_o,ndup_o);
  auto a_o = v_o.template access<AtomicDup<NEIGHFLAG,DeviceType>::value>();

  const int i = d_ilist[ii];
  if (mask[i] & groupbit) {
@@ -831,8 +856,9 @@ template<int NEIGHFLAG>
KOKKOS_INLINE_FUNCTION
void FixQEqReaxKokkos<DeviceType>::sparse23_item(int ii) const
{
  // The q array is atomic for Half/Thread neighbor style
  Kokkos::View<F_FLOAT*, typename DAT::t_float_1d::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_o = d_o;
  // The q array is duplicated for OpenMP, atomic for CUDA, and neither for Serial
  auto v_o = ScatterViewHelper<NeedDup<NEIGHFLAG,DeviceType>::value,decltype(dup_o),decltype(ndup_o)>::get(dup_o,ndup_o);
  auto a_o = v_o.template access<AtomicDup<NEIGHFLAG,DeviceType>::value>();

  const int i = d_ilist[ii];
  if (mask[i] & groupbit) {
@@ -890,8 +916,9 @@ template<int NEIGHFLAG>
KOKKOS_INLINE_FUNCTION
void FixQEqReaxKokkos<DeviceType>::sparse33_item(int ii) const
{
  // The q array is atomic for Half/Thread neighbor style
  Kokkos::View<F_FLOAT*, typename DAT::t_float_1d::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_o = d_o;
  // The q array is duplicated for OpenMP, atomic for CUDA, and neither for Serial
  auto v_o = ScatterViewHelper<NeedDup<NEIGHFLAG,DeviceType>::value,decltype(dup_o),decltype(ndup_o)>::get(dup_o,ndup_o);
  auto a_o = v_o.template access<AtomicDup<NEIGHFLAG,DeviceType>::value>();

  const int i = d_ilist[ii];
  if (mask[i] & groupbit) {
+4 −0
Original line number Diff line number Diff line
@@ -148,6 +148,7 @@ class FixQEqReaxKokkos : public FixQEqReax {
 private:
  int inum;
  int allocated_flag;
  int need_dup;

  typedef Kokkos::DualView<int***,DeviceType> tdual_int_1d;
  Kokkos::DualView<params_qeq*,Kokkos::LayoutRight,DeviceType> k_params;
@@ -192,6 +193,9 @@ class FixQEqReaxKokkos : public FixQEqReax {
  HAT::t_ffloat_2d h_s_hist, h_t_hist;
  typename AT::t_ffloat_2d_randomread r_s_hist, r_t_hist;

  Kokkos::Experimental::ScatterView<F_FLOAT*, typename AT::t_ffloat_1d::array_layout, DeviceType, Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterDuplicated> dup_o;
  Kokkos::Experimental::ScatterView<F_FLOAT*, typename AT::t_ffloat_1d::array_layout, DeviceType, Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterNonDuplicated> ndup_o;

  void init_shielding_k();
  void init_hist();
  void allocate_matrix();
+7 −0
Original line number Diff line number Diff line
@@ -166,6 +166,13 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp)
  }
#endif

#ifndef KOKKOS_HAVE_SERIAL
  if (num_threads == 1)
    error->warning(FLERR,"When using a single thread, the Kokkos Serial backend "
                         "(i.e. Makefile.kokkos_mpi_only) gives better performance "
                         "than the OpenMP backend");
#endif

  Kokkos::InitArguments args;
  args.num_threads = num_threads;
  args.num_numa = numa;
+13 −0
Original line number Diff line number Diff line
@@ -16,6 +16,7 @@

#include "pointers.h"
#include "kokkos_type.h"
#include "pair_kokkos.h"

namespace LAMMPS_NS {

@@ -40,6 +41,18 @@ class KokkosLMP : protected Pointers {
  ~KokkosLMP();
  void accelerator(int, char **);
  int neigh_count(int);

  template<class DeviceType>
  int need_dup()
  {
    int value = 0;
  
    if (neighflag == HALFTHREAD)
      value = NeedDup<HALFTHREAD,DeviceType>::value;
  
    return value;
  }

 private:
  static void my_signal_handler(int);
};
Loading