Commit d8aa6d53 authored by Stan Moore's avatar Stan Moore
Browse files

Remove hardcoded GPU-direct in KOKKOS package

parent 64e152bc
Loading
Loading
Loading
Loading
+3 −6
Original line number Diff line number Diff line
@@ -102,12 +102,9 @@ the case, especially when using pre-compiled MPI libraries provided by
a Linux distribution. This is not a problem when using only a single
GPU and a single MPI rank on a desktop. When running with multiple
MPI ranks, you may see segmentation faults without GPU-direct support.
Many of those can be avoided by adding the flags '-pk kokkos comm no'
to the LAMMPS command line or using "package kokkos comm on"_package.html
in the input file, however for some KOKKOS enabled styles like 
"EAM"_pair_eam.html or "PPPM"_kspace_style.html, this is not the case
and a GPU-direct enabled MPI library is REQUIRED.

These can be avoided by adding the flags '-pk kokkos comm no gpu/direct no'
to the LAMMPS command line or using "package kokkos comm no gpu/direct no"_package.html
in the input file.

Use a C++11 compatible compiler and set KOKKOS_ARCH variable in
/src/MAKE/OPTIONS/Makefile.kokkos_cuda_mpi for both GPU and CPU as
+16 −6
Original line number Diff line number Diff line
@@ -84,6 +84,9 @@ args = arguments specific to the style :l
        no = perform communication pack/unpack in non-KOKKOS mode
        host = perform pack/unpack on host (e.g. with OpenMP threading)
        device = perform pack/unpack on device (e.g. on GPU)
      {gpu/direct} = {off} or {on}
        off = do not use GPU-direct
        on = use GPU-direct (default)
  {omp} args = Nthreads keyword value ...
    Nthread = # of OpenMP threads to associate with each MPI process
    zero or more keyword/value pairs may be appended
@@ -505,6 +508,13 @@ typically faster to let the host handle communication, by using the
{host} value.  Using {host} instead of {no} will enable use of
multiple threads to pack/unpack communicated data.

The {gpu/direct} keyword chooses whether GPU-direct will be used. When
this keyword is set to {on}, buffers in GPU memory are passed directly
through MPI send/receive calls. This reduces the overhead of first copying
the data to the host CPU. However, GPU-direct is not supported on all
systems, which can lead to segmentation faults; in that case a value of
{off} is required.

:line

The {omp} style invokes settings associated with the use of the
@@ -611,12 +621,12 @@ is used. If it is not used, you must invoke the package intel
command in your input script or via the "-pk intel" "command-line
switch"_Section_start.html#start_6.

For the KOKKOS package, the option defaults neigh = full,
neigh/qeq = full, newton = off, binsize = 0.0, and comm = device.
These settings are made automatically by the required "-k on" "command-line
switch"_Section_start.html#start_6.  You can change them by using the
package kokkos command in your input script or via the "-pk kokkos"
"command-line switch"_Section_start.html#start_6.
For the KOKKOS package, the option defaults are neigh = full, neigh/qeq =
full, newton = off, binsize = 0.0, comm = device, and gpu/direct = on.
These settings are made automatically by the required "-k on"
"command-line switch"_Section_start.html#start_6. You can change them by
using the package kokkos command in your input script or via the "-pk
kokkos" "command-line switch"_Section_start.html#start_6.

For the OMP package, the default is Nthreads = 0 and the option
defaults are neigh = yes.  These settings are made automatically if
+21 −3
Original line number Diff line number Diff line
@@ -404,12 +404,30 @@ void CommKokkos::forward_comm_pair_device(Pair *pair)
    // if self, set recv buffer to send buffer

    if (sendproc[iswap] != me) {
      if (recvnum[iswap])
        MPI_Irecv(k_buf_recv_pair.view<DeviceType>().data(),nsize*recvnum[iswap],MPI_DOUBLE,
      double* buf_send_pair;
      double* buf_recv_pair;
      if (lmp->kokkos->gpu_direct) {
        buf_send_pair = k_buf_send_pair.view<DeviceType>().data();
        buf_recv_pair = k_buf_recv_pair.view<DeviceType>().data();
      } else {
        k_buf_send_pair.modify<DeviceType>();
        k_buf_send_pair.sync<LMPHostType>();
        buf_send_pair = k_buf_send_pair.h_view.data();
        buf_recv_pair = k_buf_recv_pair.h_view.data();
      }

      if (recvnum[iswap]) {
        MPI_Irecv(buf_recv_pair,nsize*recvnum[iswap],MPI_DOUBLE,
                  recvproc[iswap],0,world,&request);
      }
      if (sendnum[iswap])
        MPI_Send(k_buf_send_pair.view<DeviceType>().data(),n,MPI_DOUBLE,sendproc[iswap],0,world);
        MPI_Send(buf_send_pair,n,MPI_DOUBLE,sendproc[iswap],0,world);
      if (recvnum[iswap]) MPI_Wait(&request,MPI_STATUS_IGNORE);

      if (!lmp->kokkos->gpu_direct) {
        k_buf_recv_pair.modify<LMPHostType>();
        k_buf_recv_pair.sync<DeviceType>();
      }
    } else k_buf_recv_pair = k_buf_send_pair;

    // unpack buffer
+39 −4
Original line number Diff line number Diff line
@@ -18,6 +18,7 @@
#include "memory_kokkos.h"
#include "error.h"
#include "kokkos_base.h"
#include "kokkos.h"

using namespace LAMMPS_NS;

@@ -526,11 +527,28 @@ void GridCommKokkos<DeviceType>::forward_comm(KSpace *kspace, int which)
    DeviceType::fence();

    if (swap[m].sendproc != me) {
      MPI_Irecv(k_buf2.view<DeviceType>().data(),nforward*swap[m].nunpack,MPI_FFT_SCALAR,
      MPI_FFT_SCALAR* buf1;
      MPI_FFT_SCALAR* buf2;
      if (lmp->kokkos->gpu_direct) {
        buf1 = k_buf1.view<DeviceType>().data();
        buf2 = k_buf2.view<DeviceType>().data();
      } else {
        k_buf1.modify<DeviceType>();
        k_buf1.sync<LMPHostType>();
        buf1 = k_buf1.h_view.data();
        buf2 = k_buf2.h_view.data();
      }

      MPI_Irecv(buf2,nforward*swap[m].nunpack,MPI_FFT_SCALAR,
                swap[m].recvproc,0,gridcomm,&request);
      MPI_Send(k_buf1.view<DeviceType>().data(),nforward*swap[m].npack,MPI_FFT_SCALAR,
      MPI_Send(buf1,nforward*swap[m].npack,MPI_FFT_SCALAR,
               swap[m].sendproc,0,gridcomm);
      MPI_Wait(&request,MPI_STATUS_IGNORE);

      if (!lmp->kokkos->gpu_direct) {
        k_buf2.modify<LMPHostType>();
        k_buf2.sync<DeviceType>();
      }
    }

    kspaceKKBase->unpack_forward_kspace_kokkos(which,k_buf2,swap[m].nunpack,k_unpacklist,m);
@@ -559,11 +577,28 @@ void GridCommKokkos<DeviceType>::reverse_comm(KSpace *kspace, int which)
    DeviceType::fence();

    if (swap[m].recvproc != me) {
      MPI_Irecv(k_buf2.view<DeviceType>().data(),nreverse*swap[m].npack,MPI_FFT_SCALAR,
      MPI_FFT_SCALAR* buf1;
      MPI_FFT_SCALAR* buf2;
      if (lmp->kokkos->gpu_direct) {
        buf1 = k_buf1.view<DeviceType>().data();
        buf2 = k_buf2.view<DeviceType>().data();
      } else {
        k_buf1.modify<DeviceType>();
        k_buf1.sync<LMPHostType>();
        buf1 = k_buf1.h_view.data();
        buf2 = k_buf2.h_view.data();
      }

      MPI_Irecv(buf2,nreverse*swap[m].npack,MPI_FFT_SCALAR,
                swap[m].sendproc,0,gridcomm,&request);
      MPI_Send(k_buf1.view<DeviceType>().data(),nreverse*swap[m].nunpack,MPI_FFT_SCALAR,
      MPI_Send(buf1,nreverse*swap[m].nunpack,MPI_FFT_SCALAR,
               swap[m].recvproc,0,gridcomm);
      MPI_Wait(&request,MPI_STATUS_IGNORE);

      if (!lmp->kokkos->gpu_direct) {
        k_buf2.modify<LMPHostType>();
        k_buf2.sync<DeviceType>();
      }
    }

    kspaceKKBase->unpack_reverse_kspace_kokkos(which,k_buf2,swap[m].npack,k_packlist,m);
+10 −2
Original line number Diff line number Diff line
@@ -156,11 +156,11 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp)
    } else if (-1 == have_gpu_direct() ) {
      error->warning(FLERR,"Kokkos with CUDA assumes GPU-direct is available,"
                     " but cannot determine if this is the case\n         try"
                     " '-pk kokkos comm no' when getting segmentation faults");
                     " '-pk kokkos comm no gpu/direct no' when getting segmentation faults");
    } else if ( 0 == have_gpu_direct() ) {
      error->warning(FLERR,"GPU-direct is NOT available, but some parts of "
                     "Kokkos with CUDA require it\n         try"
                     " '-pk kokkos comm no' when getting segmentation faults");
                     " '-pk kokkos comm no gpu/direct no' when getting segmentation faults");
    } else {
      ; // should never get here
    }
@@ -186,6 +186,7 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp)
  exchange_comm_on_host = 0;
  forward_comm_on_host = 0;
  reverse_comm_on_host = 0;
  gpu_direct = 0;

#ifdef KILL_KOKKOS_ON_SIGSEGV
  signal(SIGSEGV, my_signal_handler);
@@ -216,6 +217,7 @@ void KokkosLMP::accelerator(int narg, char **arg)
  double binsize = 0.0;
  exchange_comm_classic = forward_comm_classic = reverse_comm_classic = 0;
  exchange_comm_on_host = forward_comm_on_host = reverse_comm_on_host = 0;
  gpu_direct = 1;

  int iarg = 0;
  while (iarg < narg) {
@@ -299,6 +301,12 @@ void KokkosLMP::accelerator(int narg, char **arg)
        reverse_comm_on_host = 0;
      } else error->all(FLERR,"Illegal package kokkos command");
      iarg += 2;
    } else if (strcmp(arg[iarg],"gpu/direct") == 0) {
      if (iarg+2 > narg) error->all(FLERR,"Illegal package kokkos command");
      if (strcmp(arg[iarg+1],"off") == 0) gpu_direct = 0;
      else if (strcmp(arg[iarg+1],"on") == 0) gpu_direct = 1;
      else error->all(FLERR,"Illegal package kokkos command");
      iarg += 2;
    } else error->all(FLERR,"Illegal package kokkos command");
  }

Loading