Merge pull request #1496 from akkamesh/enh-ext-reaxc (fe295727) · Commits · 郑智淋 / lammps

src/KOKKOS/fix_qeq_reax_kokkos.cpp

+252 −4

Original line number	Diff line number	Diff line
		@@ -12,7 +12,8 @@
		------------------------------------------------------------------------- */

		/* ----------------------------------------------------------------------
		Contributing author: Ray Shan (SNL), Stan Moore (SNL)
		Contributing authors: Ray Shan (SNL), Stan Moore (SNL),
		Kamesh Arumugam (NVIDIA)
		------------------------------------------------------------------------- */

		#include <cmath>
		@@ -68,6 +69,8 @@ FixQEqReaxKokkos(LAMMPS lmp, int narg, char *arg) :
		memory->destroy(s_hist);
		memory->destroy(t_hist);
		grow_arrays(atom->nmax);

		d_mfill_offset = typename AT::t_int_scalar("qeq/kk:mfill_offset");
		}

		/* ---------------------------------------------------------------------- */
		@@ -217,17 +220,46 @@ void FixQEqReaxKokkos<DeviceType>::pre_force(int vflag)
		copymode = 1;

		// allocate

		allocate_array();

		// get max number of neighbor

		if (!allocated_flag \|\| update->ntimestep == neighbor->lastcall)
		allocate_matrix();

		// compute_H
		FixQEqReaxKokkosComputeHFunctor<DeviceType> computeH_functor(this);

		if (lmp->kokkos->ngpus == 0) { // CPU
		if (neighflag == FULL) {
		FixQEqReaxKokkosComputeHFunctor<DeviceType, FULL> computeH_functor(this);
		Kokkos::parallel_scan(inum,computeH_functor);
		} else { // HALF and HALFTHREAD are the same
		FixQEqReaxKokkosComputeHFunctor<DeviceType, HALF> computeH_functor(this);
		Kokkos::parallel_scan(inum,computeH_functor);
		}
		} else { // GPU, use teams
		Kokkos::deep_copy(d_mfill_offset,0);

		int vector_length = 32;
		int atoms_per_team = 4;
		int num_teams = inum / atoms_per_team + (inum % atoms_per_team ? 1 : 0);

		Kokkos::TeamPolicy<DeviceType> policy(num_teams, atoms_per_team,
		vector_length);
		if (neighflag == FULL) {
		FixQEqReaxKokkosComputeHFunctor<DeviceType, FULL> computeH_functor(
		this, atoms_per_team, vector_length);
		Kokkos::parallel_for(policy, computeH_functor);
		} else { // HALF and HALFTHREAD are the same
		FixQEqReaxKokkosComputeHFunctor<DeviceType, HALF> computeH_functor(
		this, atoms_per_team, vector_length);
		Kokkos::parallel_for(policy, computeH_functor);
		}
		}

		// init_matvec

		k_s_hist.template sync<DeviceType>();
		k_t_hist.template sync<DeviceType>();
		FixQEqReaxKokkosMatVecFunctor<DeviceType> matvec_functor(this);
		@@ -257,12 +289,15 @@ void FixQEqReaxKokkos<DeviceType>::pre_force(int vflag)
		ndup_o = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterNonDuplicated> (d_o);

		// 1st cg solve over b_s, s

		cg_solve1();

		// 2nd cg solve over b_t, t

		cg_solve2();

		// calculate_Q();

		calculate_q();
		k_s_hist.template modify<DeviceType>();
		k_t_hist.template modify<DeviceType>();
		@@ -273,6 +308,7 @@ void FixQEqReaxKokkos<DeviceType>::pre_force(int vflag)
		allocated_flag = 1;

		// free duplicated memory

		if (need_dup)
		dup_o = decltype(dup_o)();

		@@ -377,6 +413,7 @@ void FixQEqReaxKokkos<DeviceType>::zero_item(int ii) const
		/* ---------------------------------------------------------------------- */

		template<class DeviceType>
		template <int NEIGHFLAG>
		KOKKOS_INLINE_FUNCTION
		void FixQEqReaxKokkos<DeviceType>::compute_h_item(int ii, int &m_fill, const bool &final) const
		{
		@@ -403,7 +440,7 @@ void FixQEqReaxKokkos<DeviceType>::compute_h_item(int ii, int &m_fill, const boo
		const X_FLOAT dely = x(j,1) - ytmp;
		const X_FLOAT delz = x(j,2) - ztmp;

		if (neighflag != FULL) {
		if (NEIGHFLAG != FULL) {
		// skip half of the interactions
		const tagint jtag = tag(j);
		if (j >= nlocal) {
		@@ -437,6 +474,217 @@ void FixQEqReaxKokkos<DeviceType>::compute_h_item(int ii, int &m_fill, const boo

		/* ---------------------------------------------------------------------- */

		// Calculate Qeq matrix H where H is a sparse matrix and H[i][j] represents the electrostatic interaction coefficients on atom-i with atom-j
		// d_val - contains the non-zero entries of sparse matrix H
		// d_numnbrs - d_numnbrs[i] contains the # of non-zero entries in the i-th row of H (which also represents the # of neighbor atoms with electrostatic interaction coefficients with atom-i)
		// d_firstnbr- d_firstnbr[i] contains the beginning index from where the H matrix entries corresponding to row-i is stored in d_val
		// d_jlist - contains the column index corresponding to each entry in d_val

		template <class DeviceType>
		template <int NEIGHFLAG>
		void FixQEqReaxKokkos<DeviceType>::compute_h_team(
		const typename Kokkos::TeamPolicy<DeviceType>::member_type &team,
		int atoms_per_team, int vector_length) const {

		// scratch space setup
		Kokkos::View<int *, Kokkos::ScratchMemorySpace<DeviceType>,
		Kokkos::MemoryTraits<Kokkos::Unmanaged>>
		s_ilist(team.team_shmem(), atoms_per_team);
		Kokkos::View<int *, Kokkos::ScratchMemorySpace<DeviceType>,
		Kokkos::MemoryTraits<Kokkos::Unmanaged>>
		s_numnbrs(team.team_shmem(), atoms_per_team);
		Kokkos::View<int *, Kokkos::ScratchMemorySpace<DeviceType>,
		Kokkos::MemoryTraits<Kokkos::Unmanaged>>
		s_firstnbr(team.team_shmem(), atoms_per_team);

		Kokkos::View<int **, Kokkos::ScratchMemorySpace<DeviceType>,
		Kokkos::MemoryTraits<Kokkos::Unmanaged>>
		s_jtype(team.team_shmem(), atoms_per_team, vector_length);
		Kokkos::View<int **, Kokkos::ScratchMemorySpace<DeviceType>,
		Kokkos::MemoryTraits<Kokkos::Unmanaged>>
		s_jlist(team.team_shmem(), atoms_per_team, vector_length);
		Kokkos::View<F_FLOAT **, Kokkos::ScratchMemorySpace<DeviceType>,
		Kokkos::MemoryTraits<Kokkos::Unmanaged>>
		s_r(team.team_shmem(), atoms_per_team, vector_length);

		// team of threads work on atoms with index in [firstatom, lastatom)
		int firstatom = team.league_rank() * atoms_per_team;
		int lastatom =
		(firstatom + atoms_per_team < inum) ? (firstatom + atoms_per_team) : inum;

		// kokkos-thread-0 is used to load info from global memory into scratch space
		if (team.team_rank() == 0) {

		// copy atom indices from d_ilist[firstatom:lastatom] to scratch space s_ilist[0:atoms_per_team]
		// copy # of neighbor atoms for all the atoms with indices in d_ilist[firstatom:lastatom] from d_numneigh to scratch space s_numneigh[0:atoms_per_team]
		// calculate total number of neighbor atoms for all atoms assigned to the current team of threads (Note - Total # of neighbor atoms here provides the
		// upper bound space requirement to store the H matrix values corresponding to the atoms with indices in d_ilist[firstatom:lastatom])

		Kokkos::parallel_scan(Kokkos::ThreadVectorRange(team, atoms_per_team),
		[&](const int &idx, int &totalnbrs, bool final) {
		int ii = firstatom + idx;

		if (ii < inum) {
		const int i = d_ilist[ii];
		int jnum = d_numneigh[i];

		if (final) {
		s_ilist[idx] = i;
		s_numnbrs[idx] = jnum;
		s_firstnbr[idx] = totalnbrs;
		}
		totalnbrs += jnum;
		} else {
		s_numnbrs[idx] = 0;
		}
		});
		}

		// barrier ensures that the data moved to scratch space is visible to all the
		// threads of the corresponding team
		team.team_barrier();

		// calculate the global memory offset from where the H matrix values to be
		// calculated by the current team will be stored in d_val
		int team_firstnbr_idx = 0;
		Kokkos::single(Kokkos::PerTeam(team),
		[=](int &val) {
		int totalnbrs = s_firstnbr[lastatom - firstatom - 1] +
		s_numnbrs[lastatom - firstatom - 1];
		val = Kokkos::atomic_fetch_add(&d_mfill_offset(), totalnbrs);
		},
		team_firstnbr_idx);

		// map the H matrix computation of each atom to kokkos-thread (one atom per
		// kokkos-thread) neighbor computation for each atom is assigned to vector
		// lanes of the corresponding thread
		Kokkos::parallel_for(
		Kokkos::TeamThreadRange(team, atoms_per_team), [&](const int &idx) {
		int ii = firstatom + idx;

		if (ii < inum) {
		const int i = s_ilist[idx];

		if (mask[i] & groupbit) {
		const X_FLOAT xtmp = x(i, 0);
		const X_FLOAT ytmp = x(i, 1);
		const X_FLOAT ztmp = x(i, 2);
		const int itype = type(i);
		const tagint itag = tag(i);
		const int jnum = s_numnbrs[idx];

		// calculate the write-offset for atom-i's first neighbor
		int atomi_firstnbr_idx = team_firstnbr_idx + s_firstnbr[idx];
		Kokkos::single(Kokkos::PerThread(team),
		[&]() { d_firstnbr[i] = atomi_firstnbr_idx; });

		// current # of neighbor atoms with non-zero electrostatic
		// interaction coefficients with atom-i which represents the # of
		// non-zero elements in row-i of H matrix
		int atomi_nbrs_inH = 0;

		// calculate H matrix values corresponding to atom-i where neighbors
		// are processed in batches and the batch size is vector_length
		for (int jj_start = 0; jj_start < jnum; jj_start += vector_length) {

		int atomi_nbr_writeIdx = atomi_firstnbr_idx + atomi_nbrs_inH;

		// count the # of neighbor atoms with non-zero electrostatic
		// interaction coefficients with atom-i in the current batch
		int atomi_nbrs_curbatch = 0;

		// compute rsq, jtype, j and store in scratch space which is
		// reused later
		Kokkos::parallel_reduce(
		Kokkos::ThreadVectorRange(team, vector_length),
		[&](const int &idx, int &m_fill) {
		const int jj = jj_start + idx;

		// initialize: -1 represents no interaction with atom-j
		// where j = d_neighbors(i,jj)
		s_jlist(team.team_rank(), idx) = -1;

		if (jj < jnum) {
		int j = d_neighbors(i, jj);
		j &= NEIGHMASK;
		const int jtype = type(j);

		const X_FLOAT delx = x(j, 0) - xtmp;
		const X_FLOAT dely = x(j, 1) - ytmp;
		const X_FLOAT delz = x(j, 2) - ztmp;

		// valid nbr interaction
		bool valid = true;
		if (NEIGHFLAG != FULL) {
		// skip half of the interactions
		const tagint jtag = tag(j);
		if (j >= nlocal) {
		if (itag > jtag) {
		if ((itag + jtag) % 2 == 0)
		valid = false;
		} else if (itag < jtag) {
		if ((itag + jtag) % 2 == 1)
		valid = false;
		} else {
		if (x(j, 2) < ztmp)
		valid = false;
		if (x(j, 2) == ztmp && x(j, 1) < ytmp)
		valid = false;
		if (x(j, 2) == ztmp && x(j, 1) == ytmp &&
		x(j, 0) < xtmp)
		valid = false;
		}
		}
		}

		const F_FLOAT rsq =
		delx * delx + dely * dely + delz * delz;
		if (rsq > cutsq)
		valid = false;

		if (valid) {
		s_jlist(team.team_rank(), idx) = j;
		s_jtype(team.team_rank(), idx) = jtype;
		s_r(team.team_rank(), idx) = sqrt(rsq);
		m_fill++;
		}
		}
		},
		atomi_nbrs_curbatch);

		// write non-zero entries of H to global memory
		Kokkos::parallel_scan(
		Kokkos::ThreadVectorRange(team, vector_length),
		[&](const int &idx, int &m_fill, bool final) {
		int j = s_jlist(team.team_rank(), idx);
		if (final) {
		if (j != -1) {
		const int jtype = s_jtype(team.team_rank(), idx);
		const F_FLOAT r = s_r(team.team_rank(), idx);
		const F_FLOAT shldij = d_shield(itype, jtype);

		d_jlist[atomi_nbr_writeIdx + m_fill] = j;
		d_val[atomi_nbr_writeIdx + m_fill] =
		calculate_H_k(r, shldij);
		}
		}

		if (j != -1) {
		m_fill++;
		}
		});
		atomi_nbrs_inH += atomi_nbrs_curbatch;
		}

		Kokkos::single(Kokkos::PerThread(team),
		[&]() { d_numnbrs[i] = atomi_nbrs_inH; });
		}
		}
		});
		}

		/* ---------------------------------------------------------------------- */

		template<class DeviceType>
		KOKKOS_INLINE_FUNCTION
		double FixQEqReaxKokkos<DeviceType>::calculate_H_k(const F_FLOAT &r, const F_FLOAT &shld) const

src/KOKKOS/fix_qeq_reax_kokkos.h

+46 −4

Original line number	Diff line number	Diff line
		@@ -53,9 +53,14 @@ class FixQEqReaxKokkos : public FixQEqReax {
		KOKKOS_INLINE_FUNCTION
		void zero_item(int) const;

		template<int NEIGHFLAG>
		KOKKOS_INLINE_FUNCTION
		void compute_h_item(int, int &, const bool &) const;

		template<int NEIGHFLAG>
		KOKKOS_INLINE_FUNCTION
		void compute_h_team(const typename Kokkos::TeamPolicy <DeviceType> ::member_type &team, int, int) const;

		KOKKOS_INLINE_FUNCTION
		void matvec_item(int) const;

		@@ -150,6 +155,8 @@ class FixQEqReaxKokkos : public FixQEqReax {
		int allocated_flag;
		int need_dup;

		typename AT::t_int_scalar d_mfill_offset;

		typedef Kokkos::DualView<int***,DeviceType> tdual_int_1d;
		Kokkos::DualView<params_qeq*,Kokkos::LayoutRight,DeviceType> k_params;
		typename Kokkos::DualView<params_qeq*, Kokkos::LayoutRight,DeviceType>::t_dev_const params;
		@@ -247,16 +254,51 @@ struct FixQEqReaxKokkosMatVecFunctor {
		}
		};

		template <class DeviceType>
		template <class DeviceType, int NEIGHFLAG>
		struct FixQEqReaxKokkosComputeHFunctor {
		typedef DeviceType device_type ;
		int atoms_per_team, vector_length;
		typedef int value_type;
		typedef Kokkos::ScratchMemorySpace<DeviceType> scratch_space;
		FixQEqReaxKokkos<DeviceType> c;

		FixQEqReaxKokkosComputeHFunctor(FixQEqReaxKokkos<DeviceType>* c_ptr):c(*c_ptr) {
		c.cleanup_copy();
		};

		FixQEqReaxKokkosComputeHFunctor(FixQEqReaxKokkos<DeviceType> *c_ptr,
		int _atoms_per_team, int _vector_length)
		: c(*c_ptr), atoms_per_team(_atoms_per_team),
		vector_length(_vector_length) {
		c.cleanup_copy();
		};

		KOKKOS_INLINE_FUNCTION
		void operator()(const int ii, int &m_fill, const bool &final) const {
		c.compute_h_item(ii,m_fill,final);
		c.template compute_h_item<NEIGHFLAG>(ii,m_fill,final);
		}

		KOKKOS_INLINE_FUNCTION
		void operator()(
		const typename Kokkos::TeamPolicy<DeviceType>::member_type &team) const {
		c.template compute_h_team<NEIGHFLAG>(team, atoms_per_team, vector_length);
		}

		size_t team_shmem_size(int team_size) const {
		size_t shmem_size =
		Kokkos::View<int *, scratch_space, Kokkos::MemoryUnmanaged>::shmem_size(
		atoms_per_team) + // s_ilist
		Kokkos::View<int *, scratch_space, Kokkos::MemoryUnmanaged>::shmem_size(
		atoms_per_team) + // s_numnbrs
		Kokkos::View<int *, scratch_space, Kokkos::MemoryUnmanaged>::shmem_size(
		atoms_per_team) + // s_firstnbr
		Kokkos::View<int **, scratch_space, Kokkos::MemoryUnmanaged>::
		shmem_size(atoms_per_team, vector_length) + // s_jtype
		Kokkos::View<int **, scratch_space, Kokkos::MemoryUnmanaged>::
		shmem_size(atoms_per_team, vector_length) + // s_j
		Kokkos::View<F_FLOAT **, scratch_space,
		Kokkos::MemoryUnmanaged>::shmem_size(atoms_per_team,
		vector_length); // s_r
		return shmem_size;
		}
		};

src/KOKKOS/kokkos.cpp

+16 −16

Original line number	Diff line number	Diff line
		@@ -78,9 +78,9 @@ KokkosLMP::KokkosLMP(LAMMPS lmp, int narg, char *arg) : Pointers(lmp)

		// process any command-line args that invoke Kokkos settings

		ngpu = 0;
		ngpus = 0;
		int device = 0;
		num_threads = 1;
		nthreads = 1;
		numa = 1;

		int iarg = 0;
		@@ -96,7 +96,7 @@ KokkosLMP::KokkosLMP(LAMMPS lmp, int narg, char *arg) : Pointers(lmp)
		error->all(FLERR,"GPUs are requested but Kokkos has not been compiled for CUDA");
		#endif
		if (iarg+2 > narg) error->all(FLERR,"Invalid Kokkos command-line args");
		ngpu = atoi(arg[iarg+1]);
		ngpus = atoi(arg[iarg+1]);

		int skip_gpu = 9999;
		if (iarg+2 < narg && isdigit(arg[iarg+2][0])) {
		@@ -108,23 +108,23 @@ KokkosLMP::KokkosLMP(LAMMPS lmp, int narg, char *arg) : Pointers(lmp)
		char *str;
		if ((str = getenv("SLURM_LOCALID"))) {
		int local_rank = atoi(str);
		device = local_rank % ngpu;
		device = local_rank % ngpus;
		if (device >= skip_gpu) device++;
		}
		if ((str = getenv("MV2_COMM_WORLD_LOCAL_RANK"))) {
		int local_rank = atoi(str);
		device = local_rank % ngpu;
		device = local_rank % ngpus;
		if (device >= skip_gpu) device++;
		}
		if ((str = getenv("OMPI_COMM_WORLD_LOCAL_RANK"))) {
		int local_rank = atoi(str);
		device = local_rank % ngpu;
		device = local_rank % ngpus;
		if (device >= skip_gpu) device++;
		}

		} else if (strcmp(arg[iarg],"t") == 0 \|\|
		strcmp(arg[iarg],"threads") == 0) {
		num_threads = atoi(arg[iarg+1]);
		nthreads = atoi(arg[iarg+1]);
		iarg += 2;

		} else if (strcmp(arg[iarg],"n") == 0 \|\|
		@@ -138,12 +138,12 @@ KokkosLMP::KokkosLMP(LAMMPS lmp, int narg, char *arg) : Pointers(lmp)
		// initialize Kokkos

		if (me == 0) {
		if (screen) fprintf(screen," will use up to %d GPU(s) per node\n",ngpu);
		if (logfile) fprintf(logfile," will use up to %d GPU(s) per node\n",ngpu);
		if (screen) fprintf(screen," will use up to %d GPU(s) per node\n",ngpus);
		if (logfile) fprintf(logfile," will use up to %d GPU(s) per node\n",ngpus);
		}

		#ifdef KOKKOS_ENABLE_CUDA
		if (ngpu <= 0)
		if (ngpus <= 0)
		error->all(FLERR,"Kokkos has been compiled for CUDA but no GPUs are requested");

		// check and warn about GPU-direct availability when using multiple MPI tasks
		@@ -167,14 +167,14 @@ KokkosLMP::KokkosLMP(LAMMPS lmp, int narg, char *arg) : Pointers(lmp)
		#endif

		#ifndef KOKKOS_ENABLE_SERIAL
		if (num_threads == 1)
		if (nthreads == 1)
		error->warning(FLERR,"When using a single thread, the Kokkos Serial backend "
		"(i.e. Makefile.kokkos_mpi_only) gives better performance "
		"than the OpenMP backend");
		#endif

		Kokkos::InitArguments args;
		args.num_threads = num_threads;
		args.num_threads = nthreads;
		args.num_numa = numa;
		args.device_id = device;

		@@ -187,14 +187,14 @@ KokkosLMP::KokkosLMP(LAMMPS lmp, int narg, char *arg) : Pointers(lmp)
		neigh_thread = 0;
		neigh_thread_set = 0;
		neighflag_qeq_set = 0;
		if (ngpu > 0) {
		if (ngpus > 0) {
		neighflag = FULL;
		neighflag_qeq = FULL;
		newtonflag = 0;
		exchange_comm_classic = forward_comm_classic = reverse_comm_classic = 0;
		exchange_comm_on_host = forward_comm_on_host = reverse_comm_on_host = 0;
		} else {
		if (num_threads > 1) {
		if (nthreads > 1) {
		neighflag = HALFTHREAD;
		neighflag_qeq = HALFTHREAD;
		} else {
		@@ -237,7 +237,7 @@ void KokkosLMP::accelerator(int narg, char **arg)
		if (iarg+2 > narg) error->all(FLERR,"Illegal package kokkos command");
		if (strcmp(arg[iarg+1],"full") == 0) neighflag = FULL;
		else if (strcmp(arg[iarg+1],"half") == 0) {
		if (num_threads > 1 \|\| ngpu > 0)
		if (nthreads > 1 \|\| ngpus > 0)
		neighflag = HALFTHREAD;
		else
		neighflag = HALF;
		@@ -249,7 +249,7 @@ void KokkosLMP::accelerator(int narg, char **arg)
		if (iarg+2 > narg) error->all(FLERR,"Illegal package kokkos command");
		if (strcmp(arg[iarg+1],"full") == 0) neighflag_qeq = FULL;
		else if (strcmp(arg[iarg+1],"half") == 0) {
		if (num_threads > 1 \|\| ngpu > 0)
		if (nthreads > 1 \|\| ngpus > 0)
		neighflag_qeq = HALFTHREAD;
		else
		neighflag_qeq = HALF;

src/KOKKOS/kokkos.h

+1 −1

Original line number	Diff line number	Diff line
		@@ -32,7 +32,7 @@ class KokkosLMP : protected Pointers {
		int exchange_comm_on_host;
		int forward_comm_on_host;
		int reverse_comm_on_host;
		int num_threads,ngpu;
		int nthreads,ngpus;
		int numa;
		int auto_sync;
		int gpu_direct_flag;

src/KOKKOS/neighbor_kokkos.cpp

+1 −1

Original line number	Diff line number	Diff line
		@@ -362,7 +362,7 @@ void NeighborKokkos::modify_mol_intra_grow_kokkos(){

		/* ---------------------------------------------------------------------- */
		void NeighborKokkos::set_binsize_kokkos() {
		if (!binsizeflag && lmp->kokkos->ngpu > 0) {
		if (!binsizeflag && lmp->kokkos->ngpus > 0) {
		binsize_user = cutneighmax;
		binsizeflag = 1;
		}

Admin message