Commit 4d31afce authored by Stan Moore

Port changes to PPPMKokkos

parent e2923d2d
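This ports the GridComm rewrite to the Kokkos classes: the separate cg and
cg_peratom communicators collapse into a single gc object, PPPMKokkos now owns
the two ghost-cell comm buffers (k_gc_buf1/k_gc_buf2) and passes them into
every comm call, and the GridCommKokkos interface takes its DualView buffers
by reference instead of by value.

Why the buffers are now references: a minimal standalone sketch (not LAMMPS
code, assuming a recent Kokkos). A DualView copied by value shares its
allocations, but modify/sync bookkeeping done on the copy is not guaranteed
to reach the caller's handle, and every copy adds reference-count churn.

#include <Kokkos_DualView.hpp>

using Buf = Kokkos::DualView<double*>;

void fill_on_device(Buf &buf)       // by reference, as in this commit
{
  auto d = buf.view_device();
  Kokkos::parallel_for("fill", d.extent(0),
                       KOKKOS_LAMBDA(const int i) { d(i) = 2.0*i; });
  buf.modify_device();              // the flag change is visible to the caller
}

int main(int argc, char **argv)
{
  Kokkos::initialize(argc, argv);
  {
    Buf buf("buf", 8);
    fill_on_device(buf);
    buf.sync_host();                // device -> host copy; the flag survived
  }
  Kokkos::finalize();
}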
src/KOKKOS/gridcomm_kokkos.cpp +31 −33
@@ -101,7 +101,7 @@ GridCommKokkos<DeviceType>::~GridCommKokkos()
    send[i].packlist = NULL;

  for (int i = 0; i < nrecv; i++)
    k_recv_unpacklist[i] = NULL;
    recv[i].unpacklist = NULL;

  for (int i = 0; i < ncopy; i++) {
    copy[i].packlist = NULL;
@@ -487,7 +487,7 @@ void GridCommKokkos<DeviceType>::setup_tiled(int &nbuf1, int &nbuf2)

  send = (Send *) memory->smalloc(nrecv_request*sizeof(Send),"GridCommKokkos:send");

  k_send_packlist = DAT::t_int_1d("GridCommKokkos:send_packlist",nrecv_request,k_send_packlist.extent(1));
  k_send_packlist = DAT::tdual_int_2d("GridCommKokkos:send_packlist",nrecv_request,k_send_packlist.extent(1));

  sresponse = (Response *)
    memory->smalloc(nrecv_request*sizeof(Response),"GridCommKokkos:sresponse");
@@ -533,7 +533,7 @@ void GridCommKokkos<DeviceType>::setup_tiled(int &nbuf1, int &nbuf2)
  
  recv = (Recv *) memory->smalloc(nrecv_response*sizeof(Recv),"GridCommKokkos:recv");

  k_recv_unpacklist = DAT::t_int_1d("GridCommKokkos:recv_unpacklist",nrecv_response,k_recv_unpacklist.extent(1));
  k_recv_unpacklist = DAT::tdual_int_2d("GridCommKokkos:recv_unpacklist",nrecv_response,k_recv_unpacklist.extent(1));

  adjacent = 1;
  
@@ -546,7 +546,7 @@ void GridCommKokkos<DeviceType>::setup_tiled(int &nbuf1, int &nbuf2)
    yhi = rresponse[i].box[3] + overlap[m].pbc[1] * ny;
    zlo = rresponse[i].box[4] + overlap[m].pbc[2] * nz;
    zhi = rresponse[i].box[5] + overlap[m].pbc[2] * nz;
    recv[i].nunpack = indices(k_recv_unpacklist,i,xlo,xhi,ylo,yhi,zlo,zhi);
    recv[i].nunpack = indices_kokkos(k_recv_unpacklist,i,xlo,xhi,ylo,yhi,zlo,zhi);
    
    if (xlo != inxhi+1 && xhi != inxlo-1 &&
	ylo != inyhi+1 && yhi != inylo-1 &&
@@ -559,8 +559,8 @@ void GridCommKokkos<DeviceType>::setup_tiled(int &nbuf1, int &nbuf2)
  
  copy = (Copy *) memory->smalloc(ncopy*sizeof(Copy),"GridCommKokkos:copy");

  k_copy_packlist = DAT::t_int_2d("GridCommKokkos:copy_packlist",ncopy,k_copy_packlist.extent(1));
  k_copy_unpacklist = DAT::t_int_2d("GridCommKokkos:copy_unpacklist",ncopy,k_copy_unpacklist.extent(1));
  k_copy_packlist = DAT::tdual_int_2d("GridCommKokkos:copy_packlist",ncopy,k_copy_packlist.extent(1));
  k_copy_unpacklist = DAT::tdual_int_2d("GridCommKokkos:copy_unpacklist",ncopy,k_copy_unpacklist.extent(1));
 
  ncopy = 0;
  for (m = 0; m < noverlap; m++) {
@@ -643,7 +643,7 @@ void GridCommKokkos<DeviceType>::setup_tiled(int &nbuf1, int &nbuf2)

template<class DeviceType>
void GridCommKokkos<DeviceType>::forward_comm_kspace(KSpace *kspace, int nper, int nbyte, int which,
				   FFT_DAT::tdual_FFT_SCALAR_1d k_buf1, FFT_DAT::tdual_FFT_SCALAR_1d k_buf2, MPI_Datatype datatype)
				   FFT_DAT::tdual_FFT_SCALAR_1d &k_buf1, FFT_DAT::tdual_FFT_SCALAR_1d &k_buf2, MPI_Datatype datatype)
{
  if (layout == REGULAR)
    forward_comm_kspace_regular(kspace,nper,nbyte,which,k_buf1,k_buf2,datatype);
@@ -656,7 +656,7 @@ void GridCommKokkos<DeviceType>::forward_comm_kspace(KSpace *kspace, int nper, i
template<class DeviceType>
void GridCommKokkos<DeviceType>::
forward_comm_kspace_regular(KSpace *kspace, int nper, int nbyte, int which,
			    FFT_DAT::tdual_FFT_SCALAR_1d k_buf1, FFT_DAT::tdual_FFT_SCALAR_1d k_buf2, MPI_Datatype datatype)
			    FFT_DAT::tdual_FFT_SCALAR_1d &k_buf1, FFT_DAT::tdual_FFT_SCALAR_1d &k_buf2, MPI_Datatype datatype)
{
  int m;
  MPI_Request request;
@@ -674,9 +674,9 @@ forward_comm_kspace_regular(KSpace *kspace, int nper, int nbyte, int which,

  for (m = 0; m < nswap; m++) {
    if (swap[m].sendproc == me)
      KokkosBaseFFT->pack_forward_grid_kokkos(which,k_buf2,swap[m].npack,k_swap_packlist,m);
      kspaceKKBase->pack_forward_grid_kokkos(which,k_buf2,swap[m].npack,k_swap_packlist,m);
    else
      KokkosBaseFFT->pack_forward_grid_kokkos(which,k_buf1,swap[m].npack,k_swap_packlist,m);
      kspaceKKBase->pack_forward_grid_kokkos(which,k_buf1,swap[m].npack,k_swap_packlist,m);
    DeviceType().fence();

    if (swap[m].sendproc != me) {
@@ -698,7 +698,7 @@ forward_comm_kspace_regular(KSpace *kspace, int nper, int nbyte, int which,
      }
    }

    KokkosBaseFFT->unpack_forward_grid_kokkos(which,k_buf2,0,swap[m].nunpack,k_swap_unpacklist,m);
    kspaceKKBase->unpack_forward_grid_kokkos(which,k_buf2,0,swap[m].nunpack,k_swap_unpacklist,m);
    DeviceType().fence();
  }
}
@@ -708,12 +708,10 @@ forward_comm_kspace_regular(KSpace *kspace, int nper, int nbyte, int which,
template<class DeviceType>
void GridCommKokkos<DeviceType>::
forward_comm_kspace_tiled(KSpace *kspace, int nper, int nbyte, int which,
			  FFT_DAT::tdual_FFT_SCALAR_1d k_buf1, FFT_DAT::tdual_FFT_SCALAR_1d k_buf2, MPI_Datatype datatype)
			  FFT_DAT::tdual_FFT_SCALAR_1d &k_buf1, FFT_DAT::tdual_FFT_SCALAR_1d &k_buf2, MPI_Datatype datatype)
{
  int i,m,offset;

  KokkosBaseFFT* kspaceKKBase = dynamic_cast<KokkosBaseFFT*>(kspace);

  KokkosBaseFFT* kspaceKKBase = dynamic_cast<KokkosBaseFFT*>(kspace);
  FFT_SCALAR* buf1;
  FFT_SCALAR* buf2;
@@ -729,14 +727,14 @@ forward_comm_kspace_tiled(KSpace *kspace, int nper, int nbyte, int which,
  
  for (m = 0; m < nrecv; m++) {
    offset = nper * recv[m].offset * nbyte;
    MPI_Irecv(buf2[offset],nper*recv[m].nunpack,datatype,
    MPI_Irecv(&buf2[offset],nper*recv[m].nunpack,datatype,
	      recv[m].proc,0,gridcomm,&requests[m]);
  }

  // perform all sends to other procs

  for (m = 0; m < nsend; m++) {
    KokkosBaseFFT->pack_forward_grid_kokkos(which,k_buf1,send[m].npack,k_send_packlist,m);
    kspaceKKBase->pack_forward_grid_kokkos(which,k_buf1,send[m].npack,k_send_packlist,m);
    DeviceType().fence();

    if (!lmp->kokkos->cuda_aware_flag) {
@@ -750,8 +748,8 @@ forward_comm_kspace_tiled(KSpace *kspace, int nper, int nbyte, int which,
  // perform all copies to self

  for (m = 0; m < ncopy; m++) {
    KokkosBaseFFT->pack_forward_grid_kokkos(which,k_buf1,copy[m].npack,k_copy_packlist,m);
    KokkosBaseFFT->unpack_forward_grid_kokkos(which,k_buf1,0,copy[m].nunpack,k_copy_unpacklist,m);
    kspaceKKBase->pack_forward_grid_kokkos(which,k_buf1,copy[m].npack,k_copy_packlist,m);
    kspaceKKBase->unpack_forward_grid_kokkos(which,k_buf1,0,copy[m].nunpack,k_copy_unpacklist,m);
  }

  // unpack all received data
@@ -765,7 +763,7 @@ forward_comm_kspace_tiled(KSpace *kspace, int nper, int nbyte, int which,
    }

    offset = nper * recv[m].offset * nbyte;
    KokkosBaseFFT->unpack_forward_grid_kokkos(which,k_buf2,offset,
    kspaceKKBase->unpack_forward_grid_kokkos(which,k_buf2,offset,
				recv[m].nunpack,k_recv_unpacklist,m);
    DeviceType().fence();
  }
@@ -778,7 +776,7 @@ forward_comm_kspace_tiled(KSpace *kspace, int nper, int nbyte, int which,

template<class DeviceType>
void GridCommKokkos<DeviceType>::reverse_comm_kspace(KSpace *kspace, int nper, int nbyte, int which,
				    FFT_DAT::tdual_FFT_SCALAR_1d k_buf1, FFT_DAT::tdual_FFT_SCALAR_1d k_buf2, MPI_Datatype datatype)
				    FFT_DAT::tdual_FFT_SCALAR_1d &k_buf1, FFT_DAT::tdual_FFT_SCALAR_1d &k_buf2, MPI_Datatype datatype)
{
  if (layout == REGULAR)
    reverse_comm_kspace_regular(kspace,nper,nbyte,which,k_buf1,k_buf2,datatype);
@@ -791,7 +789,7 @@ void GridCommKokkos<DeviceType>::reverse_comm_kspace(KSpace *kspace, int nper, i
template<class DeviceType>
void GridCommKokkos<DeviceType>::
reverse_comm_kspace_regular(KSpace *kspace, int nper, int nbyte, int which,
			    FFT_DAT::tdual_FFT_SCALAR_1d k_buf1, FFT_DAT::tdual_FFT_SCALAR_1d k_buf2, MPI_Datatype datatype)
			    FFT_DAT::tdual_FFT_SCALAR_1d &k_buf1, FFT_DAT::tdual_FFT_SCALAR_1d &k_buf2, MPI_Datatype datatype)
{
  int m;
  MPI_Request request;
@@ -809,9 +807,9 @@ reverse_comm_kspace_regular(KSpace *kspace, int nper, int nbyte, int which,

  for (m = nswap-1; m >= 0; m--) {
    if (swap[m].recvproc == me)
      KokkosBaseFFT->pack_reverse_grid(which,k_buf2,swap[m].nunpack,k_swap_unpacklist,m);
      kspaceKKBase->pack_reverse_grid_kokkos(which,k_buf2,swap[m].nunpack,k_swap_unpacklist,m);
    else
      KokkosBaseFFT->pack_reverse_grid(which,k_buf1,swap[m].nunpack,k_swap_unpacklist,m);
      kspaceKKBase->pack_reverse_grid_kokkos(which,k_buf1,swap[m].nunpack,k_swap_unpacklist,m);
    DeviceType().fence();

    if (swap[m].recvproc != me) {
@@ -834,7 +832,7 @@ reverse_comm_kspace_regular(KSpace *kspace, int nper, int nbyte, int which,
      }
    }

    KokkosBaseFFT->unpack_reverse_grid_kokkos(which,k_buf2,0,swap[m].npack,k_swap_packlist,m);
    kspaceKKBase->unpack_reverse_grid_kokkos(which,k_buf2,0,swap[m].npack,k_swap_packlist,m);
    DeviceType().fence();
  }
}
@@ -844,7 +842,7 @@ reverse_comm_kspace_regular(KSpace *kspace, int nper, int nbyte, int which,
template<class DeviceType>
void GridCommKokkos<DeviceType>::
reverse_comm_kspace_tiled(KSpace *kspace, int nper, int nbyte, int which,
			  FFT_DAT::tdual_FFT_SCALAR_1d k_buf1, FFT_DAT::tdual_FFT_SCALAR_1d k_buf2, MPI_Datatype datatype)
			  FFT_DAT::tdual_FFT_SCALAR_1d &k_buf1, FFT_DAT::tdual_FFT_SCALAR_1d &k_buf2, MPI_Datatype datatype)
{
  int i,m,offset;

@@ -866,14 +864,14 @@ reverse_comm_kspace_tiled(KSpace *kspace, int nper, int nbyte, int which,
  
  for (m = 0; m < nsend; m++) {
    offset = nper * send[m].offset * nbyte;
    MPI_Irecv(buf2[offset],nper*send[m].npack,datatype,
    MPI_Irecv(&buf2[offset],nper*send[m].npack,datatype,
	      send[m].proc,0,gridcomm,&requests[m]);
  }

  // perform all sends to other procs

  for (m = 0; m < nrecv; m++) {
    KokkosBaseFFT->pack_reverse_grid(which,k_buf1,recv[m].nunpack,k_recv_unpacklist,m);
    kspaceKKBase->pack_reverse_grid_kokkos(which,k_buf1,recv[m].nunpack,k_recv_unpacklist,m);
    DeviceType().fence();

    if (!lmp->kokkos->cuda_aware_flag) {
@@ -887,8 +885,8 @@ reverse_comm_kspace_tiled(KSpace *kspace, int nper, int nbyte, int which,
  // perform all copies to self

  for (m = 0; m < ncopy; m++) {
    KokkosBaseFFT->pack_reverse_grid(which,k_buf1,copy[m].nunpack,k_copy_unpacklist,m);
    KokkosBaseFFT->unpack_reverse_grid_kokkos(which,k_buf1,0,copy[m].npack,k_copy_packlist,m);
    kspaceKKBase->pack_reverse_grid_kokkos(which,k_buf1,copy[m].nunpack,k_copy_unpacklist,m);
    kspaceKKBase->unpack_reverse_grid_kokkos(which,k_buf1,0,copy[m].npack,k_copy_packlist,m);
  }

  // unpack all received data
@@ -902,7 +900,7 @@ reverse_comm_kspace_tiled(KSpace *kspace, int nper, int nbyte, int which,
    }

    offset = nper * send[m].offset * nbyte;
    KokkosBaseFFT->unpack_reverse_grid_kokkos(which,k_buf2,offset,
    kspaceKKBase->unpack_reverse_grid_kokkos(which,k_buf2,offset,
				send[m].npack,k_send_packlist,m);
    DeviceType().fence();
  }
@@ -923,9 +921,9 @@ void GridCommKokkos<DeviceType>::grow_swap()
  swap = (Swap *)
    memory->srealloc(swap,maxswap*sizeof(Swap),"GridCommKokkos:swap");

  if (!k_swap_packlist.data()) {
    k_swap_packlist = DAT::t_int_1d("GridCommKokkos:swap_packlist",maxswap,k_swap_packlist.extent(1));
    k_swap_unpacklist = DAT::t_int_1d("GridCommKokkos:swap_unpacklist",maxswap,k_swap_unpacklist.extent(1));
  if (!k_swap_packlist.d_view.data()) {
    k_swap_packlist = DAT::tdual_int_2d("GridCommKokkos:swap_packlist",maxswap,k_swap_packlist.extent(1));
    k_swap_unpacklist = DAT::tdual_int_2d("GridCommKokkos:swap_unpacklist",maxswap,k_swap_unpacklist.extent(1));
  } else {
    k_swap_packlist.resize(maxswap,k_swap_packlist.extent(1));
    k_swap_unpacklist.resize(maxswap,k_swap_unpacklist.extent(1));
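The grow_swap() change above also shows the first-allocation test moving to
d_view.data(). A hedged sketch of that allocate-or-resize idiom (illustrative
names, not LAMMPS code): a default-constructed DualView owns no allocation, so
its device pointer is NULL; construct it once with a label, and resize()
afterwards to keep existing entries.

#include <Kokkos_DualView.hpp>

Kokkos::DualView<int**> k_list;     // stand-in for k_swap_packlist

void grow(int maxswap)
{
  if (!k_list.d_view.data())        // never allocated: construct with a label
    k_list = Kokkos::DualView<int**>("grow:list", maxswap, k_list.extent(1));
  else                              // already allocated: keep contents, resize
    k_list.resize(maxswap, k_list.extent(1));
}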
src/KOKKOS/gridcomm_kokkos.h +6 −8
@@ -34,12 +34,10 @@ class GridCommKokkos : public GridComm {
	   int, int, int, int, int, int,
	   int, int, int, int, int, int);
  ~GridCommKokkos();
  void setup(int &, int &);
  int ghost_adjacent();
  void forward_comm_kspace(class KSpace *, int, int, int,
			   void *, void *, MPI_Datatype);
			   FFT_DAT::tdual_FFT_SCALAR_1d &, FFT_DAT::tdual_FFT_SCALAR_1d &, MPI_Datatype);
  void reverse_comm_kspace(class KSpace *, int, int, int,
			   void *, void *, MPI_Datatype);
			   FFT_DAT::tdual_FFT_SCALAR_1d &, FFT_DAT::tdual_FFT_SCALAR_1d &, MPI_Datatype);

 private:
  DAT::tdual_int_2d k_swap_packlist;
@@ -60,13 +58,13 @@ class GridCommKokkos : public GridComm {
  void setup_tiled(int &, int &);

  void forward_comm_kspace_regular(class KSpace *, int, int, int,
                                   void *, void *, MPI_Datatype);
                                   FFT_DAT::tdual_FFT_SCALAR_1d &, FFT_DAT::tdual_FFT_SCALAR_1d &, MPI_Datatype);
  void forward_comm_kspace_tiled(class KSpace *, int, int, int,
                                 void *, void *, MPI_Datatype);
                                 FFT_DAT::tdual_FFT_SCALAR_1d &, FFT_DAT::tdual_FFT_SCALAR_1d &, MPI_Datatype);
  void reverse_comm_kspace_regular(class KSpace *, int, int, int,
                                   void *, void *, MPI_Datatype);
                                   FFT_DAT::tdual_FFT_SCALAR_1d &, FFT_DAT::tdual_FFT_SCALAR_1d &, MPI_Datatype);
  void reverse_comm_kspace_tiled(class KSpace *, int, int, int,
                                 void *, void *, MPI_Datatype);
                                 FFT_DAT::tdual_FFT_SCALAR_1d &, FFT_DAT::tdual_FFT_SCALAR_1d &, MPI_Datatype);

  void grow_swap();
  
src/KOKKOS/kokkos_base_fft.h +4 −4
@@ -23,10 +23,10 @@ class KokkosBaseFFT {
  KokkosBaseFFT() {}

  //Kspace
  virtual void pack_forward_kspace_kokkos(int, FFT_DAT::tdual_FFT_SCALAR_1d &, int, DAT::tdual_int_2d &, int) {};
  virtual void unpack_forward_kspace_kokkos(int, FFT_DAT::tdual_FFT_SCALAR_1d &, int, int, DAT::tdual_int_2d &, int) {};
  virtual void pack_reverse_kspace_kokkos(int, FFT_DAT::tdual_FFT_SCALAR_1d &, int, DAT::tdual_int_2d &, int) {};
  virtual void unpack_reverse_kspace_kokkos(int, FFT_DAT::tdual_FFT_SCALAR_1d &, int, int, DAT::tdual_int_2d &, int) {};
  virtual void pack_forward_grid_kokkos(int, FFT_DAT::tdual_FFT_SCALAR_1d &, int, DAT::tdual_int_2d &, int) {};
  virtual void unpack_forward_grid_kokkos(int, FFT_DAT::tdual_FFT_SCALAR_1d &, int, int, DAT::tdual_int_2d &, int) {};
  virtual void pack_reverse_grid_kokkos(int, FFT_DAT::tdual_FFT_SCALAR_1d &, int, DAT::tdual_int_2d &, int) {};
  virtual void unpack_reverse_grid_kokkos(int, FFT_DAT::tdual_FFT_SCALAR_1d &, int, int, DAT::tdual_int_2d &, int) {};
};

}
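These empty virtuals make KokkosBaseFFT a mixin: a KSpace style with Kokkos
grid kernels (PPPMKokkos here) inherits it and overrides the hooks, and
GridCommKokkos reaches them through a cross-cast from the generic KSpace
pointer, roughly as below (a sketch following the .cpp diff above; the NULL
check is illustrative, the diff itself casts unconditionally):

  KokkosBaseFFT *kspaceKKBase = dynamic_cast<KokkosBaseFFT *>(kspace);
  if (kspaceKKBase)
    kspaceKKBase->pack_forward_grid_kokkos(which,k_buf1,swap[m].npack,
                                           k_swap_packlist,m);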
src/KOKKOS/pppm_kokkos.cpp +53 −200
@@ -96,8 +96,7 @@ PPPMKokkos<DeviceType>::PPPMKokkos(LAMMPS *lmp) : PPPM(lmp)

  fft1 = fft2 = NULL;
  remap = NULL;
  cg = NULL;
  cg_peratom = NULL;
  gc = NULL;

  nmax = 0;
  //part2grid = NULL;
@@ -255,9 +254,7 @@ void PPPMKokkos<DeviceType>::init()
  //   or overlap is allowed, then done
  // else reduce order and try again

  int (*procneigh)[2] = comm->procneigh;

  GridCommKokkos<DeviceType> *cgtmp = NULL;
  GridCommKokkos<DeviceType> *gctmp = NULL;
  int iteration = 0;

  while (order >= minorder) {
@@ -269,24 +266,23 @@ void PPPMKokkos<DeviceType>::init()
    set_grid_local();
    if (overlap_allowed) break;

    cgtmp = new GridCommKokkos<DeviceType>(lmp,world,1,1,
    gctmp = new GridCommKokkos<DeviceType>(lmp,world,nx_pppm,ny_pppm,nz_pppm,
                         nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
                         nxlo_out,nxhi_out,nylo_out,nyhi_out,nzlo_out,nzhi_out,
                         procneigh[0][0],procneigh[0][1],procneigh[1][0],
                         procneigh[1][1],procneigh[2][0],procneigh[2][1]);
    cgtmp->ghost_notify();
    if (!cgtmp->ghost_overlap()) break;
    delete cgtmp;
                         nxlo_out,nxhi_out,nylo_out,nyhi_out,nzlo_out,nzhi_out);
    int tmp1,tmp2;
    gctmp->setup(tmp1,tmp2);
    if (!gctmp->ghost_adjacent()) break;
    delete gctmp;

    order--;
    iteration++;
  }

  if (order < minorder) error->all(FLERR,"PPPM order < minimum allowed order");
  if (!overlap_allowed && cgtmp->ghost_overlap())
  if (!overlap_allowed && !gctmp->ghost_adjacent())
    error->all(FLERR,"PPPM grid stencil extends "
               "beyond nearest neighbor processor");
  if (cgtmp) delete cgtmp;
  if (gctmp) delete gctmp;

  // adjust g_ewald

@@ -320,8 +316,6 @@ void PPPMKokkos<DeviceType>::init()
  // don't invoke allocate peratom(), will be allocated when needed

  allocate();
  cg->ghost_notify();
  cg->setup();

  // pre-compute Green's function denominator expansion
  // pre-compute 1d charge distribution coefficients
@@ -564,11 +558,9 @@ void PPPMKokkos<DeviceType>::setup_grid()

  allocate();

  cg->ghost_notify();
  if (overlap_allowed == 0 && cg->ghost_overlap())
  if (!overlap_allowed && !gc->ghost_adjacent())
    error->all(FLERR,"PPPM grid stencil extends "
               "beyond nearest neighbor processor");
  cg->setup();

  // pre-compute Green's function denominator expansion
  // pre-compute 1d charge distribution coefficients
@@ -576,7 +568,7 @@ void PPPMKokkos<DeviceType>::setup_grid()
  compute_gf_denom();
  compute_rho_coeff();

  // pre-compute volume-dependent coeffs
  // pre-compute volume-dependent coeffs for portion of grid I now own

  setup();
}
@@ -609,11 +601,8 @@ void PPPMKokkos<DeviceType>::compute(int eflag, int vflag)
    d_vatom = k_vatom.view<DeviceType>();
  }

  if (evflag_atom && !peratom_allocate_flag) {
  if (evflag_atom && !peratom_allocate_flag)
    allocate_peratom();
    cg_peratom->ghost_notify();
    cg_peratom->setup();
  }

  x = atomKK->k_x.view<DeviceType>();
  f = atomKK->k_f.view<DeviceType>();
@@ -667,7 +656,8 @@ void PPPMKokkos<DeviceType>::compute(int eflag, int vflag)
  //   to fully sum contribution in their 3d bricks
  // remap from 3d decomposition to FFT decomposition

  cg->reverse_comm(this,REVERSE_RHO);
  gc->reverse_comm_kspace(this,1,sizeof(FFT_SCALAR),REVERSE_RHO,
                          k_gc_buf1,k_gc_buf2,MPI_FFT_SCALAR);
  brick2fft();

  // compute potential gradient on my FFT grid and
@@ -680,12 +670,14 @@ void PPPMKokkos<DeviceType>::compute(int eflag, int vflag)
  // all procs communicate E-field values
  // to fill ghost cells surrounding their 3d bricks

  cg->forward_comm(this,FORWARD_IK);
  gc->forward_comm_kspace(this,3,sizeof(FFT_SCALAR),FORWARD_IK,
			  k_gc_buf1,k_gc_buf2,MPI_FFT_SCALAR);

  // extra per-atom energy/virial communication

  if (evflag_atom)
      cg_peratom->forward_comm(this,FORWARD_IK_PERATOM);
      gc->forward_comm_kspace(this,7,sizeof(FFT_SCALAR),FORWARD_IK_PERATOM,
                              k_gc_buf1,k_gc_buf2,MPI_FFT_SCALAR);

  // calculate the force on my particles

@@ -731,6 +723,7 @@ void PPPMKokkos<DeviceType>::compute(int eflag, int vflag)
      copymode = 1;
      Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_self1>(0,nlocal),*this);
      copymode = 0;
      //for (i = nlocal; i < ntotal; i++) d_eatom[i] *= 0.5*qscale;
    }

@@ -844,14 +837,19 @@ void PPPMKokkos<DeviceType>::allocate()
                          1,0,0,FFT_PRECISION,collective_flag,cuda_aware_flag);

  // create ghost grid object for rho and electric field communication
  // also create 2 bufs for ghost grid cell comm, passed to GridComm methods

  int (*procneigh)[2] = comm->procneigh;

  cg = new GridCommKokkos<DeviceType>(lmp,world,3,1,
  gc = new GridCommKokkos<DeviceType>(lmp,world,nx_pppm,ny_pppm,nz_pppm,
                    nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
                    nxlo_out,nxhi_out,nylo_out,nyhi_out,nzlo_out,nzhi_out,
                    procneigh[0][0],procneigh[0][1],procneigh[1][0],
                    procneigh[1][1],procneigh[2][0],procneigh[2][1]);
                    nxlo_out,nxhi_out,nylo_out,nyhi_out,nzlo_out,nzhi_out);

  gc->setup(ngc_buf1,ngc_buf2);

  if (differentiation_flag) npergrid = 1;
  else npergrid = 3;

  k_gc_buf1 = FFT_DAT::tdual_FFT_SCALAR_1d("pppm:gc_buf1",npergrid*ngc_buf1);
  k_gc_buf2 = FFT_DAT::tdual_FFT_SCALAR_1d("pppm:gc_buf2",npergrid*ngc_buf2);
}

/* ----------------------------------------------------------------------
@@ -876,8 +874,8 @@ void PPPMKokkos<DeviceType>::deallocate()
  fft2 = NULL;
  delete remap;
  remap = NULL;
  delete cg;
  cg = NULL;
  delete gc;
  gc = NULL;
}

/* ----------------------------------------------------------------------
@@ -899,16 +897,14 @@ void PPPMKokkos<DeviceType>::allocate_peratom()
  d_v5_brick = typename FFT_AT::t_FFT_SCALAR_3d("pppm:d_v5_brick",nzhi_out-nzlo_out+1,nyhi_out-nylo_out+1,nxhi_out-nxlo_out+1);


  // create ghost grid object for rho and electric field communication
  // use same GC ghost grid object for peratom grid communication
  // but need to reallocate a larger gc_buf1 and gc_buf2

  int (*procneigh)[2] = comm->procneigh;
  if (differentiation_flag) npergrid = 6;
  else npergrid = 7;

  cg_peratom =
    new GridCommKokkos<DeviceType>(lmp,world,7,1,
                 nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
                 nxlo_out,nxhi_out,nylo_out,nyhi_out,nzlo_out,nzhi_out,
                 procneigh[0][0],procneigh[0][1],procneigh[1][0],
                 procneigh[1][1],procneigh[2][0],procneigh[2][1]);
  k_gc_buf1 = FFT_DAT::tdual_FFT_SCALAR_1d("pppm:gc_buf1",npergrid*ngc_buf1);
  k_gc_buf2 = FFT_DAT::tdual_FFT_SCALAR_1d("pppm:gc_buf2",npergrid*ngc_buf2);
}

/* ----------------------------------------------------------------------
@@ -919,9 +915,6 @@ template<class DeviceType>
void PPPMKokkos<DeviceType>::deallocate_peratom()
{
  peratom_allocate_flag = 0;

  delete cg_peratom;
  cg_peratom = NULL;
}

/* ----------------------------------------------------------------------
@@ -1185,153 +1178,11 @@ double PPPMKokkos<DeviceType>::final_accuracy()
template<class DeviceType>
void PPPMKokkos<DeviceType>::set_grid_local()
{
  // global indices of PPPM grid range from 0 to N-1
  // nlo_in,nhi_in = lower/upper limits of the 3d sub-brick of
  //   global PPPM grid that I own without ghost cells
  // for slab PPPM, assign z grid as if it were not extended

  nxlo_in = static_cast<int> (comm->xsplit[comm->myloc[0]] * nx_pppm);
  nxhi_in = static_cast<int> (comm->xsplit[comm->myloc[0]+1] * nx_pppm) - 1;

  nylo_in = static_cast<int> (comm->ysplit[comm->myloc[1]] * ny_pppm);
  nyhi_in = static_cast<int> (comm->ysplit[comm->myloc[1]+1] * ny_pppm) - 1;

  nzlo_in = static_cast<int>
      (comm->zsplit[comm->myloc[2]] * nz_pppm/slab_volfactor);
  nzhi_in = static_cast<int>
      (comm->zsplit[comm->myloc[2]+1] * nz_pppm/slab_volfactor) - 1;

  // nlower,nupper = stencil size for mapping particles to PPPM grid

  nlower = -(order-1)/2;
  nupper = order/2;

  // shift values for particle <-> grid mapping
  // add/subtract OFFSET to avoid int(-0.75) = 0 when want it to be -1

  if (order % 2) shift = OFFSET + 0.5;
  else shift = OFFSET;
  if (order % 2) shiftone = 0.0;
  else shiftone = 0.5;

  // nlo_out,nhi_out = lower/upper limits of the 3d sub-brick of
  //   global PPPM grid that my particles can contribute charge to
  // effectively nlo_in,nhi_in + ghost cells
  // nlo,nhi = global coords of grid pt to "lower left" of smallest/largest
  //           position a particle in my box can be at
  // dist[3] = particle position bound = subbox + skin/2.0 + qdist
  //   qdist = offset due to TIP4P fictitious charge
  //   convert to triclinic if necessary
  // nlo_out,nhi_out = nlo,nhi + stencil size for particle mapping
  // for slab PPPM, assign z grid as if it were not extended
  PPPM::set_grid_local();

  double *prd,*sublo,*subhi;

  if (triclinic == 0) {
    prd = domain->prd;
  boxlo[0] = domain->boxlo[0];
  boxlo[1] = domain->boxlo[1];
  boxlo[2] = domain->boxlo[2];
    sublo = domain->sublo;
    subhi = domain->subhi;
  } else {
    prd = domain->prd_lamda;
    boxlo[0] = domain->boxlo_lamda[0];
    boxlo[1] = domain->boxlo_lamda[1];
    boxlo[2] = domain->boxlo_lamda[2];
    domain->x2lamda(atomKK->nlocal);
    sublo = domain->sublo_lamda;
    subhi = domain->subhi_lamda;
  }

  double xprd = prd[0];
  double yprd = prd[1];
  double zprd = prd[2];
  double zprd_slab = zprd*slab_volfactor;

  double dist[3];
  double cuthalf = 0.5*neighbor->skin + qdist;
  if (triclinic == 0) dist[0] = dist[1] = dist[2] = cuthalf;
  else kspacebbox(cuthalf,&dist[0]);

  int nlo,nhi;

  nlo = static_cast<int> ((sublo[0]-dist[0]-boxlo[0]) *
                            nx_pppm/xprd + shift) - OFFSET;
  nhi = static_cast<int> ((subhi[0]+dist[0]-boxlo[0]) *
                            nx_pppm/xprd + shift) - OFFSET;
  nxlo_out = nlo + nlower;
  nxhi_out = nhi + nupper;

  nlo = static_cast<int> ((sublo[1]-dist[1]-boxlo[1]) *
                            ny_pppm/yprd + shift) - OFFSET;
  nhi = static_cast<int> ((subhi[1]+dist[1]-boxlo[1]) *
                            ny_pppm/yprd + shift) - OFFSET;
  nylo_out = nlo + nlower;
  nyhi_out = nhi + nupper;

  nlo = static_cast<int> ((sublo[2]-dist[2]-boxlo[2]) *
                            nz_pppm/zprd_slab + shift) - OFFSET;
  nhi = static_cast<int> ((subhi[2]+dist[2]-boxlo[2]) *
                            nz_pppm/zprd_slab + shift) - OFFSET;
  nzlo_out = nlo + nlower;
  nzhi_out = nhi + nupper;

  // for slab PPPM, change the grid boundary for processors at +z end
  //   to include the empty volume between periodically repeating slabs
  // for slab PPPM, want charge data communicated from -z proc to +z proc,
  //   but not vice versa, also want field data communicated from +z proc to
  //   -z proc, but not vice versa
  // this is accomplished by nzhi_in = nzhi_out on +z end (no ghost cells)
  // also insure no other procs use ghost cells beyond +z limit

  if (slabflag == 1) {
    if (comm->myloc[2] == comm->procgrid[2]-1)
      nzhi_in = nzhi_out = nz_pppm - 1;
    nzhi_out = MIN(nzhi_out,nz_pppm-1);
  }

  // decomposition of FFT mesh
  // global indices range from 0 to N-1
  // proc owns entire x-dimension, clumps of columns in y,z dimensions
  // npey_fft,npez_fft = # of procs in y,z dims
  // if nprocs is small enough, proc can own 1 or more entire xy planes,
  //   else proc owns 2d sub-blocks of yz plane
  // me_y,me_z = which proc (0-npe_fft-1) I am in y,z dimensions
  // nlo_fft,nhi_fft = lower/upper limit of the section
  //   of the global FFT mesh that I own

  int npey_fft,npez_fft;
  if (nz_pppm >= nprocs) {
    npey_fft = 1;
    npez_fft = nprocs;
  } else procs2grid2d(nprocs,ny_pppm,nz_pppm,&npey_fft,&npez_fft);

  int me_y = me % npey_fft;
  int me_z = me / npey_fft;

  nxlo_fft = 0;
  nxhi_fft = nx_pppm - 1;
  nylo_fft = me_y*ny_pppm/npey_fft;
  nyhi_fft = (me_y+1)*ny_pppm/npey_fft - 1;
  nzlo_fft = me_z*nz_pppm/npez_fft;
  nzhi_fft = (me_z+1)*nz_pppm/npez_fft - 1;

  // PPPM grid pts owned by this proc, including ghosts

  ngrid = (nxhi_out-nxlo_out+1) * (nyhi_out-nylo_out+1) *
    (nzhi_out-nzlo_out+1);

  // FFT grids owned by this proc, without ghosts
  // nfft = FFT points in FFT decomposition on this proc
  // nfft_brick = FFT points in 3d brick-decomposition on this proc
  // nfft_both = greater of 2 values

  nfft = (nxhi_fft-nxlo_fft+1) * (nyhi_fft-nylo_fft+1) *
    (nzhi_fft-nzlo_fft+1);
  int nfft_brick = (nxhi_in-nxlo_in+1) * (nyhi_in-nylo_in+1) *
    (nzhi_in-nzlo_in+1);
  nfft_both = MAX(nfft,nfft_brick);
}

/* ----------------------------------------------------------------------
@@ -2668,12 +2519,12 @@ void PPPMKokkos<DeviceType>::operator()(TagPPPM_unpack_forward2, const int &i) c
  const int ix = d_list_index[i] - iz*nx*ny - iy*nx;
  if (eflag_atom) d_u_brick(iz,iy,ix) = d_buf[7*i];
  if (vflag_atom) {
    d_v0_brick(iz,iy,ix) = d_buf[7*i+1];
    d_v1_brick(iz,iy,ix) = d_buf[7*i+2];
    d_v2_brick(iz,iy,ix) = d_buf[7*i+3];
    d_v3_brick(iz,iy,ix) = d_buf[7*i+4];
    d_v4_brick(iz,iy,ix) = d_buf[7*i+5];
    d_v5_brick(iz,iy,ix) = d_buf[7*i+6];
    d_v0_brick(iz,iy,ix) = d_buf[7*i+1 + unpack_offset];
    d_v1_brick(iz,iy,ix) = d_buf[7*i+2 + unpack_offset];
    d_v2_brick(iz,iy,ix) = d_buf[7*i+3 + unpack_offset];
    d_v3_brick(iz,iy,ix) = d_buf[7*i+4 + unpack_offset];
    d_v4_brick(iz,iy,ix) = d_buf[7*i+5 + unpack_offset];
    d_v5_brick(iz,iy,ix) = d_buf[7*i+6 + unpack_offset];
  }
}

@@ -3046,7 +2897,9 @@ double PPPMKokkos<DeviceType>::memory_usage()
  if (peratom_allocate_flag)
    bytes += 6 * nbrick * sizeof(FFT_SCALAR);

  if (cg) bytes += cg->memory_usage();
  // two GridComm bufs

  bytes += (ngc_buf1 + ngc_buf2) * npergrid * sizeof(FFT_SCALAR);

  return bytes;
}
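For scale on the buffer sizing introduced in allocate(): gc->setup() returns
the largest pack and unpack ghost-cell counts, the buffers hold npergrid
values per cell, and allocate_peratom() regrows the same two buffers with
npergrid = 7 (ik) or 6 (ad). A hedged recap of the call pattern, using names
from the diff above and an illustrative count:

  int ngc_buf1,ngc_buf2;
  gc->setup(ngc_buf1,ngc_buf2);       // e.g. ngc_buf1 = 1000 ghost cells

  if (differentiation_flag) npergrid = 1;   // ad: potential only
  else npergrid = 3;                        // ik: Ex,Ey,Ez

  k_gc_buf1 = FFT_DAT::tdual_FFT_SCALAR_1d("pppm:gc_buf1",npergrid*ngc_buf1);
  k_gc_buf2 = FFT_DAT::tdual_FFT_SCALAR_1d("pppm:gc_buf2",npergrid*ngc_buf2);

  // with ik differentiation that is 3*1000 = 3000 FFT_SCALARs per buffer;
  // the E-field forward comm then passes nper = 3 alongside the buffers:
  gc->forward_comm_kspace(this,3,sizeof(FFT_SCALAR),FORWARD_IK,
                          k_gc_buf1,k_gc_buf2,MPI_FFT_SCALAR);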
src/KOKKOS/pppm_kokkos.h +10 −6

Preview size limit exceeded, changes collapsed.