Commit 708052dc authored by Kamesh AK's avatar Kamesh AK
Browse files

reaxc/qeq optimization - using kokkos hierarchical parallelism

parent 045c312c
Loading
Loading
Loading
Loading
+203 −55
Original line number Diff line number Diff line
@@ -129,6 +129,9 @@ void FixQEqReaxKokkos<DeviceType>::init()

  init_shielding_k();
  init_hist();

  k_mfill_offset = DAT::tdual_int_scalar("reax:k_mfill_offset");
  d_mfill_offset = k_mfill_offset.view<DeviceType>(); 
}

/* ---------------------------------------------------------------------- */
@@ -222,8 +225,31 @@ void FixQEqReaxKokkos<DeviceType>::pre_force(int vflag)
    allocate_matrix();

  // compute_H
  FixQEqReaxKokkosComputeHFunctor<DeviceType> computeH_functor(this);
  Kokkos::parallel_scan(inum,computeH_functor);
  k_mfill_offset.h_view() = 0;
  k_mfill_offset.modify<LMPHostType>();
  k_mfill_offset.sync<DeviceType>();

  int vector_length = 32;
  int atoms_per_team = 4;
  int num_teams = inum/atoms_per_team + (inum%atoms_per_team?1:0);

  Kokkos::TeamPolicy <DeviceType> policy(num_teams, atoms_per_team, vector_length);
  if (neighflag == FULL){
      FixQEqReaxKokkosComputeHFunctor<DeviceType, FULL> computeH_functor(this,
									 atoms_per_team,
									 vector_length);
      Kokkos::parallel_for( policy,  computeH_functor );
  }else if (neighflag == HALF){
      FixQEqReaxKokkosComputeHFunctor<DeviceType, HALF> computeH_functor(this,
									 atoms_per_team,
									 vector_length);
      Kokkos::parallel_for( policy,  computeH_functor );
  }else {
      FixQEqReaxKokkosComputeHFunctor<DeviceType, HALFTHREAD> computeH_functor(this,
									       atoms_per_team,
									       vector_length);
      Kokkos::parallel_for( policy,  computeH_functor );
  }

  // init_matvec
  k_s_hist.template sync<DeviceType>();
@@ -372,63 +398,185 @@ void FixQEqReaxKokkos<DeviceType>::zero_item(int ii) const

/* ---------------------------------------------------------------------- */

// Calculate Qeq matrix H where H is a sparse matrix and H[i][j] represents the electrostatic interaction coefficients on atom-i with atom-j
// d_val     - contains the non-zero entries of sparse matrix H
// d_numnbrs - d_numnbrs[i] contains the # of non-zero entries in the i-th row of H (which also represents the # of neighbor atoms with electrostatic interaction coefficients with atom-i)
// d_firstnbr- d_firstnbr[i] contains the beginning index from where the H matrix entries corresponding to row-i is stored in d_val
// d_jlist   - contains the column index corresponding to each entry in d_val
template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void FixQEqReaxKokkos<DeviceType>::compute_h_item(int ii, int &m_fill, const bool &final) const
{
template<int NEIGHFLAG>
void
FixQEqReaxKokkos<DeviceType>::compute_h_team(const typename Kokkos::TeamPolicy <DeviceType> ::member_type &team,
					     int atoms_per_team,
					     int vector_length) const{

    // scratch space setup
    Kokkos::View< int*,      Kokkos::ScratchMemorySpace<DeviceType>, Kokkos::MemoryTraits<Kokkos::Unmanaged> > s_ilist(team.team_shmem(), atoms_per_team);
    Kokkos::View< int*,      Kokkos::ScratchMemorySpace<DeviceType>, Kokkos::MemoryTraits<Kokkos::Unmanaged> > s_numnbrs(team.team_shmem(), atoms_per_team);
    Kokkos::View< int*,      Kokkos::ScratchMemorySpace<DeviceType>, Kokkos::MemoryTraits<Kokkos::Unmanaged> > s_firstnbr(team.team_shmem(), atoms_per_team);

    Kokkos::View< int**,     Kokkos::ScratchMemorySpace<DeviceType>, Kokkos::MemoryTraits<Kokkos::Unmanaged> > s_jtype(team.team_shmem(), atoms_per_team, vector_length);
    Kokkos::View< int**,     Kokkos::ScratchMemorySpace<DeviceType>, Kokkos::MemoryTraits<Kokkos::Unmanaged> > s_jlist(team.team_shmem(), atoms_per_team, vector_length);
    Kokkos::View< F_FLOAT**, Kokkos::ScratchMemorySpace<DeviceType>, Kokkos::MemoryTraits<Kokkos::Unmanaged> > s_r(team.team_shmem(), atoms_per_team, vector_length);

    // team of threads work on atoms with index in [firstatom, lastatom)
    int firstatom = team.league_rank() * atoms_per_team;
    int lastatom  = ( firstatom + atoms_per_team < inum ) ? ( firstatom + atoms_per_team ) : inum;

    // kokkos-thread-0 is used to load info from global memory into scratch space
    if(team.team_rank() == 0){

	// copy atom indices from d_ilist[firstatom:lastatom] to scratch space s_ilist[0:atoms_per_team]
	// copy # of neighbor atoms for all the atoms with indices in d_ilist[firstatom:lastatom] from d_numneigh to scratch space s_numneigh[0:atoms_per_team]
	// calculate total number of neighbor atoms for all atoms assigned to the current team of threads (Note - Total # of neighbor atoms here provides the
	// upper bound space requirement to store the H matrix values corresponding to the atoms with indices in d_ilist[firstatom:lastatom])

	Kokkos::parallel_scan( Kokkos::ThreadVectorRange(team, atoms_per_team), [&](const int &idx, int &totalnbrs, bool final) {
		int ii = firstatom + idx;

		if(ii < inum){
		    const int i = d_ilist[ii];
  int j,jj,jtype;
		    int jnum    = d_numneigh[i];

  if (mask[i] & groupbit) {
		    if(final){
			s_ilist[idx]    = i;
			s_numnbrs[idx]  = jnum;
			s_firstnbr[idx] = totalnbrs;
		    }
		    totalnbrs      += jnum;
		}else{
		    s_numnbrs[idx]  = 0;
		}
	    });
    }


    // barrier ensures that the data moved to scratch space is visible to all the threads of the corresponding team
    team.team_barrier();

    // calculate the global memory offset from where the H matrix values to be calculated by the current team will be stored in d_val
    int team_firstnbr_idx = 0;
    Kokkos::single (Kokkos::PerTeam (team), [=] (int &val) {
	    int totalnbrs = s_firstnbr[lastatom - firstatom - 1] + s_numnbrs[lastatom - firstatom - 1];
	    val = Kokkos::atomic_fetch_add(&d_mfill_offset(), totalnbrs);
	}, team_firstnbr_idx);


    // map the H matrix computation of each atom to kokkos-thread (one atom per kokkos-thread)
    // neighbor computation for each atom is assigned to vector lanes of the corresponding thread
    Kokkos::parallel_for( Kokkos::TeamThreadRange(team, atoms_per_team), [&] (const int &idx) {
	    int ii = firstatom + idx;

	    if(ii < inum){
		const int i = s_ilist[idx];

		if (mask[i] & groupbit) {
		    const X_FLOAT xtmp = x(i,0);
		    const X_FLOAT ytmp = x(i,1);
		    const X_FLOAT ztmp = x(i,2);
		    const int itype    = type(i);
		    const tagint itag  = tag(i);
    const int jnum = d_numneigh[i];
    if (final)
      d_firstnbr[i] = m_fill;
		    const int jnum     = s_numnbrs[idx];

		    // calculate the write-offset for atom-i's first neighbor
		    int atomi_firstnbr_idx = team_firstnbr_idx + s_firstnbr[idx];
		    Kokkos::single (Kokkos::PerThread (team), [&] () {
			    d_firstnbr[i] = atomi_firstnbr_idx;
			});


		    // current # of neighbor atoms with non-zero electrostatic interaction coefficients with atom-i
		    // which represents the # of non-zero elements in row-i of H matrix
		    int atomi_nbrs_inH = 0;

		    // calculate H matrix values corresponding to atom-i where neighbors are processed in batches and the batch size is vector_length
		    for(int jj_start = 0; jj_start < jnum; jj_start += vector_length){

			int atomi_nbr_writeIdx = atomi_firstnbr_idx + atomi_nbrs_inH;

			// count the # of neighbor atoms with non-zero electrostatic interaction coefficients with atom-i in the current batch
			int atomi_nbrs_curbatch = 0;

			// compute rsq, jtype, j and store in scratch space which is reused later
			Kokkos::parallel_reduce( Kokkos::ThreadVectorRange(team, vector_length), [&](const int &idx, int &m_fill) {
				const int jj = jj_start + idx;

				// initialize: -1 represents no interaction with atom-j where j = d_neighbors(i,jj)
				s_jlist(team.team_rank(), idx) = -1;

    for (jj = 0; jj < jnum; jj++) {
      j = d_neighbors(i,jj);
				if(jj < jnum){
				    int j = d_neighbors(i,jj);
				    j &= NEIGHMASK;
      jtype = type(j);
				    const int jtype = type(j);

				    const X_FLOAT delx = x(j,0) - xtmp;
				    const X_FLOAT dely = x(j,1) - ytmp;
				    const X_FLOAT delz = x(j,2) - ztmp;

      if (neighflag != FULL) {
				    // valid nbr interaction
				    bool valid = true;
				    if (NEIGHFLAG != FULL) {
					// skip half of the interactions
					const tagint jtag = tag(j);
					if (j >= nlocal) {
					    if (itag > jtag) {
            if ((itag+jtag) % 2 == 0) continue;
						if ((itag+jtag) % 2 == 0)
						    valid = false;
					    } else if (itag < jtag) {
            if ((itag+jtag) % 2 == 1) continue;
						if ((itag+jtag) % 2 == 1)
						    valid = false;
					    } else {
            if (x(j,2) < ztmp) continue;
            if (x(j,2) == ztmp && x(j,1)  < ytmp) continue;
            if (x(j,2) == ztmp && x(j,1) == ytmp && x(j,0) < xtmp) continue;
						if (x(j,2) < ztmp)
						    valid = false;
						if (x(j,2) == ztmp && x(j,1)  < ytmp)
						    valid = false;
						if (x(j,2) == ztmp && x(j,1) == ytmp && x(j,0) < xtmp)
						    valid = false;
					    }
					}
				    }

				    const F_FLOAT rsq = delx*delx + dely*dely + delz*delz;
      if (rsq > cutsq) continue;
				    if (rsq > cutsq)
					valid = false;

				    if(valid){
					s_jlist(team.team_rank(), idx) = j;
					s_jtype(team.team_rank(), idx) = jtype;
					s_r(team.team_rank(), idx)     = sqrt(rsq);
					m_fill++;
				    }
				}
			    }, atomi_nbrs_curbatch);

			// write non-zero entries of H to global memory
			Kokkos::parallel_scan( Kokkos::ThreadVectorRange(team, vector_length), [&](const int &idx, int &m_fill, bool final) {
				int j  = s_jlist(team.team_rank(), idx);
				if(final){
        const F_FLOAT r = sqrt(rsq);
        d_jlist(m_fill) = j;
				    if(j != -1){
					const int jtype      = s_jtype(team.team_rank(), idx);
					const F_FLOAT r      = s_r(team.team_rank(), idx);
					const F_FLOAT shldij = d_shield(itype,jtype);
        d_val(m_fill) = calculate_H_k(r,shldij);

					d_jlist[atomi_nbr_writeIdx + m_fill] = j;
					d_val[atomi_nbr_writeIdx + m_fill]   = calculate_H_k(r, shldij);
				    }
				}

				if(j !=-1){
				    m_fill++;
				}
    if (final)
      d_numnbrs[i] = m_fill - d_firstnbr[i];
			    });
			atomi_nbrs_inH += atomi_nbrs_curbatch;
		    }

		    Kokkos::single (Kokkos::PerThread (team), [&] () {
			    d_numnbrs[i] = atomi_nbrs_inH;
			});
		}
	    }
	});

}

/* ---------------------------------------------------------------------- */
+38 −11
Original line number Diff line number Diff line
@@ -34,6 +34,9 @@ struct TagSparseMatvec2 {};
struct TagSparseMatvec3 {};
struct TagZeroQGhosts{};

template<int NEIGHFLAG>
struct TagComputeHItem{};

template<class DeviceType>
class FixQEqReaxKokkos : public FixQEqReax {
 public:
@@ -53,8 +56,9 @@ class FixQEqReaxKokkos : public FixQEqReax {
  KOKKOS_INLINE_FUNCTION
  void zero_item(int) const;

  template<int NEIGHFLAG>
  KOKKOS_INLINE_FUNCTION
  void compute_h_item(int, int &, const bool &) const;
  void compute_h_team(const typename Kokkos::TeamPolicy <DeviceType> ::member_type &team, int, int) const;

  KOKKOS_INLINE_FUNCTION
  void matvec_item(int) const;
@@ -80,6 +84,10 @@ class FixQEqReaxKokkos : public FixQEqReax {
  KOKKOS_INLINE_FUNCTION
  void sparse33_item(int) const;

  template<int NEIGHFLAG>
  KOKKOS_INLINE_FUNCTION
  void operator() (TagComputeHItem<NEIGHFLAG>, const typename Kokkos::TeamPolicy <DeviceType, TagComputeHItem<NEIGHFLAG> > ::member_type &team) const;

  typedef typename Kokkos::TeamPolicy <DeviceType, TagSparseMatvec1> ::member_type membertype1;
  KOKKOS_INLINE_FUNCTION
  void operator() (TagSparseMatvec1, const membertype1 &team) const;
@@ -150,6 +158,9 @@ class FixQEqReaxKokkos : public FixQEqReax {
  int allocated_flag;
  int need_dup;

  DAT::tdual_int_scalar k_mfill_offset;
  typename AT::t_int_scalar d_mfill_offset;

  typedef Kokkos::DualView<int***,DeviceType> tdual_int_1d;
  Kokkos::DualView<params_qeq*,Kokkos::LayoutRight,DeviceType> k_params;
  typename Kokkos::DualView<params_qeq*, Kokkos::LayoutRight,DeviceType>::t_dev_const params;
@@ -247,16 +258,32 @@ struct FixQEqReaxKokkosMatVecFunctor {
  }
};

template <class DeviceType>
template <class DeviceType, int NEIGHFLAG>
struct FixQEqReaxKokkosComputeHFunctor  {
  typedef DeviceType  device_type ;
    int atoms_per_team, vector_length;
    typedef Kokkos::ScratchMemorySpace<DeviceType> scratch_space;
    FixQEqReaxKokkos<DeviceType> c;
  FixQEqReaxKokkosComputeHFunctor(FixQEqReaxKokkos<DeviceType>* c_ptr):c(*c_ptr) {

    FixQEqReaxKokkosComputeHFunctor(FixQEqReaxKokkos<DeviceType>* c_ptr,
				    int _atoms_per_team,
				    int _vector_length):
	c(*c_ptr), atoms_per_team(_atoms_per_team), vector_length(_vector_length) {
	c.cleanup_copy();
    };

    KOKKOS_INLINE_FUNCTION
  void operator()(const int ii, int &m_fill, const bool &final) const {
    c.compute_h_item(ii,m_fill,final);
    void operator()(const typename Kokkos::TeamPolicy <DeviceType> ::member_type &team) const {
	c.template compute_h_team<NEIGHFLAG> (team, atoms_per_team, vector_length);
    }

    size_t team_shmem_size( int team_size ) const {
	size_t shmem_size = Kokkos::View<int*, scratch_space, Kokkos::MemoryUnmanaged>::shmem_size(atoms_per_team) + // s_ilist
	    Kokkos::View<int*, scratch_space, Kokkos::MemoryUnmanaged>::shmem_size(atoms_per_team) + // s_numnbrs
	    Kokkos::View<int*, scratch_space, Kokkos::MemoryUnmanaged>::shmem_size(atoms_per_team) +  // s_firstnbr
	    Kokkos::View<int**, scratch_space, Kokkos::MemoryUnmanaged>::shmem_size(atoms_per_team, vector_length) + //s_jtype
	    Kokkos::View<int**, scratch_space, Kokkos::MemoryUnmanaged>::shmem_size(atoms_per_team, vector_length) + //s_j
	    Kokkos::View<F_FLOAT**, scratch_space, Kokkos::MemoryUnmanaged>::shmem_size(atoms_per_team, vector_length) ; //s_r
	return shmem_size;
    }
};