Commit d8aa6d53 authored by Stan Moore's avatar Stan Moore
Browse files

Remove hardcoded GPU-direct in KOKKOS package

parent 64e152bc
Loading
Loading
Loading
Loading
+3 −6
Original line number Diff line number Diff line
@@ -102,12 +102,9 @@ the case, especially when using pre-compiled MPI libraries provided by
a Linux distribution. This is not a problem when using only a single
GPU and a single MPI rank on a desktop. When running with multiple
MPI ranks, you may see segmentation faults without GPU-direct support.
Many of those can be avoided by adding the flags '-pk kokkos comm no'
to the LAMMPS command line or using "package kokkos comm on"_package.html
in the input file, however for some KOKKOS enabled styles like 
"EAM"_pair_eam.html or "PPPM"_kspace_style.html, this is not the case
and a GPU-direct enabled MPI library is REQUIRED.

These can be avoided by adding the flags '-pk kokkos comm no gpu/direct no'
to the LAMMPS command line or using "package kokkos comm no gpu/direct no"_package.html
in the input file.

Use a C++11 compatible compiler and set KOKKOS_ARCH variable in
/src/MAKE/OPTIONS/Makefile.kokkos_cuda_mpi for both GPU and CPU as
+16 −6
Original line number Diff line number Diff line
@@ -84,6 +84,9 @@ args = arguments specific to the style :l
        no = perform communication pack/unpack in non-KOKKOS mode
        host = perform pack/unpack on host (e.g. with OpenMP threading)
        device = perform pack/unpack on device (e.g. on GPU)
      {gpu/direct} = {off} or {on}
        off = do not use GPU-direct
        on = use GPU-direct (default)
  {omp} args = Nthreads keyword value ...
    Nthread = # of OpenMP threads to associate with each MPI process
    zero or more keyword/value pairs may be appended
@@ -505,6 +508,13 @@ typically faster to let the host handle communication, by using the
{host} value.  Using {host} instead of {no} will enable use of
multiple threads to pack/unpack communicated data.

The {gpu/direct} keyword chooses whether GPU-direct will be used. When
this keyword is set to {on}, buffers in GPU memory are passed directly
through MPI send/receive calls. This reduces the overhead of first copying
the data to the host CPU. However, GPU-direct is not supported on all
systems, which can lead to segmentation faults; in that case a value of
{off} is required.

:line

The {omp} style invokes settings associated with the use of the
@@ -611,12 +621,12 @@ is used. If it is not used, you must invoke the package intel
command in your input script or via the "-pk intel" "command-line
switch"_Section_start.html#start_6.

For the KOKKOS package, the option defaults neigh = full,
neigh/qeq = full, newton = off, binsize = 0.0, and comm = device.
These settings are made automatically by the required "-k on" "command-line
switch"_Section_start.html#start_6.  You can change them by using the
package kokkos command in your input script or via the "-pk kokkos"
"command-line switch"_Section_start.html#start_6.
For the KOKKOS package, the option defaults are neigh = full, neigh/qeq =
full, newton = off, binsize = 0.0, comm = device, and gpu/direct = on.
These settings are made automatically by the required "-k on"
"command-line switch"_Section_start.html#start_6. You can change them by
using the package kokkos command in your input script or via the "-pk
kokkos" "command-line switch"_Section_start.html#start_6.

For the OMP package, the default is Nthreads = 0 and the option
defaults are neigh = yes.  These settings are made automatically if
+21 −3
Original line number Diff line number Diff line
@@ -404,12 +404,30 @@ void CommKokkos::forward_comm_pair_device(Pair *pair)
    // if self, set recv buffer to send buffer

    if (sendproc[iswap] != me) {
      if (recvnum[iswap])
        MPI_Irecv(k_buf_recv_pair.view<DeviceType>().data(),nsize*recvnum[iswap],MPI_DOUBLE,
      double* buf_send_pair;
      double* buf_recv_pair;
      if (lmp->kokkos->gpu_direct) {
        buf_send_pair = k_buf_send_pair.view<DeviceType>().data();
        buf_recv_pair = k_buf_recv_pair.view<DeviceType>().data();
      } else {
        k_buf_send_pair.modify<DeviceType>();
        k_buf_send_pair.sync<LMPHostType>();
        buf_send_pair = k_buf_send_pair.h_view.data();
        buf_recv_pair = k_buf_recv_pair.h_view.data();
      }

      if (recvnum[iswap]) {
        MPI_Irecv(buf_recv_pair,nsize*recvnum[iswap],MPI_DOUBLE,
                  recvproc[iswap],0,world,&request);
      }
      if (sendnum[iswap])
        MPI_Send(k_buf_send_pair.view<DeviceType>().data(),n,MPI_DOUBLE,sendproc[iswap],0,world);
        MPI_Send(buf_send_pair,n,MPI_DOUBLE,sendproc[iswap],0,world);
      if (recvnum[iswap]) MPI_Wait(&request,MPI_STATUS_IGNORE);

      if (!lmp->kokkos->gpu_direct) {
        k_buf_recv_pair.modify<LMPHostType>();
        k_buf_recv_pair.sync<DeviceType>();
      }
    } else k_buf_recv_pair = k_buf_send_pair;

    // unpack buffer
+39 −4
Original line number Diff line number Diff line
@@ -18,6 +18,7 @@
#include "memory_kokkos.h"
#include "error.h"
#include "kokkos_base.h"
#include "kokkos.h"

using namespace LAMMPS_NS;

@@ -526,11 +527,28 @@ void GridCommKokkos<DeviceType>::forward_comm(KSpace *kspace, int which)
    DeviceType::fence();

    if (swap[m].sendproc != me) {
      MPI_Irecv(k_buf2.view<DeviceType>().data(),nforward*swap[m].nunpack,MPI_FFT_SCALAR,
      MPI_FFT_SCALAR* buf1;
      MPI_FFT_SCALAR* buf2;
      if (lmp->kokkos->gpu_direct) {
        buf1 = k_buf1.view<DeviceType>().data();
        buf2 = k_buf2.view<DeviceType>().data();
      } else {
        k_buf1.modify<DeviceType>();
        k_buf1.sync<LMPHostType>();
        buf1 = k_buf1.h_view.data();
        buf2 = k_buf2.h_view.data();
      }

      MPI_Irecv(buf2,nforward*swap[m].nunpack,MPI_FFT_SCALAR,
                swap[m].recvproc,0,gridcomm,&request);
      MPI_Send(k_buf1.view<DeviceType>().data(),nforward*swap[m].npack,MPI_FFT_SCALAR,
      MPI_Send(buf1,nforward*swap[m].npack,MPI_FFT_SCALAR,
               swap[m].sendproc,0,gridcomm);
      MPI_Wait(&request,MPI_STATUS_IGNORE);

      if (!lmp->kokkos->gpu_direct) {
        k_buf2.modify<LMPHostType>();
        k_buf2.sync<DeviceType>();
      }
    }

    kspaceKKBase->unpack_forward_kspace_kokkos(which,k_buf2,swap[m].nunpack,k_unpacklist,m);
@@ -559,11 +577,28 @@ void GridCommKokkos<DeviceType>::reverse_comm(KSpace *kspace, int which)
    DeviceType::fence();

    if (swap[m].recvproc != me) {
      MPI_Irecv(k_buf2.view<DeviceType>().data(),nreverse*swap[m].npack,MPI_FFT_SCALAR,
      MPI_FFT_SCALAR* buf1;
      MPI_FFT_SCALAR* buf2;
      if (lmp->kokkos->gpu_direct) {
        buf1 = k_buf1.view<DeviceType>().data();
        buf2 = k_buf2.view<DeviceType>().data();
      } else {
        k_buf1.modify<DeviceType>();
        k_buf1.sync<LMPHostType>();
        buf1 = k_buf1.h_view.data();
        buf2 = k_buf2.h_view.data();
      }

      MPI_Irecv(buf2,nreverse*swap[m].npack,MPI_FFT_SCALAR,
                swap[m].sendproc,0,gridcomm,&request);
      MPI_Send(k_buf1.view<DeviceType>().data(),nreverse*swap[m].nunpack,MPI_FFT_SCALAR,
      MPI_Send(buf1,nreverse*swap[m].nunpack,MPI_FFT_SCALAR,
               swap[m].recvproc,0,gridcomm);
      MPI_Wait(&request,MPI_STATUS_IGNORE);

      if (!lmp->kokkos->gpu_direct) {
        k_buf2.modify<LMPHostType>();
        k_buf2.sync<DeviceType>();
      }
    }

    kspaceKKBase->unpack_reverse_kspace_kokkos(which,k_buf2,swap[m].npack,k_packlist,m);
+10 −2
Original line number Diff line number Diff line
@@ -156,11 +156,11 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp)
    } else if (-1 == have_gpu_direct() ) {
      error->warning(FLERR,"Kokkos with CUDA assumes GPU-direct is available,"
                     " but cannot determine if this is the case\n         try"
                     " '-pk kokkos comm no' when getting segmentation faults");
                     " '-pk kokkos comm no gpu/direct no' when getting segmentation faults");
    } else if ( 0 == have_gpu_direct() ) {
      error->warning(FLERR,"GPU-direct is NOT available, but some parts of "
                     "Kokkos with CUDA require it\n         try"
                     " '-pk kokkos comm no' when getting segmentation faults");
                     " '-pk kokkos comm no gpu/direct no' when getting segmentation faults");
    } else {
      ; // should never get here
    }
@@ -186,6 +186,7 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp)
  exchange_comm_on_host = 0;
  forward_comm_on_host = 0;
  reverse_comm_on_host = 0;
  gpu_direct = 0;

#ifdef KILL_KOKKOS_ON_SIGSEGV
  signal(SIGSEGV, my_signal_handler);
@@ -216,6 +217,7 @@ void KokkosLMP::accelerator(int narg, char **arg)
  double binsize = 0.0;
  exchange_comm_classic = forward_comm_classic = reverse_comm_classic = 0;
  exchange_comm_on_host = forward_comm_on_host = reverse_comm_on_host = 0;
  gpu_direct = 1;

  int iarg = 0;
  while (iarg < narg) {
@@ -299,6 +301,12 @@ void KokkosLMP::accelerator(int narg, char **arg)
        reverse_comm_on_host = 0;
      } else error->all(FLERR,"Illegal package kokkos command");
      iarg += 2;
    } else if (strcmp(arg[iarg],"gpu/direct") == 0) {
      if (iarg+2 > narg) error->all(FLERR,"Illegal package kokkos command");
      if (strcmp(arg[iarg+1],"off") == 0) gpu_direct = 0;
      else if (strcmp(arg[iarg+1],"on") == 0) gpu_direct = 1;
      else error->all(FLERR,"Illegal package kokkos command");
      iarg += 2;
    } else error->all(FLERR,"Illegal package kokkos command");
  }

Loading