Commit f8226508 authored by Stan Moore's avatar Stan Moore
Browse files

Allow Kokkos remap to turn off Cuda-aware MPI

parent 5c2f0ecc
Loading
Loading
Loading
Loading
+12 −7
Original line number Diff line number Diff line
@@ -38,7 +38,8 @@ FFT3dKokkos<DeviceType>::FFT3dKokkos(LAMMPS *lmp, MPI_Comm comm, int nfast, int
             int in_klo, int in_khi,
             int out_ilo, int out_ihi, int out_jlo, int out_jhi,
             int out_klo, int out_khi,
             int scaled, int permute, int *nbuf, int usecollective) :
             int scaled, int permute, int *nbuf, int usecollective,
             int usecuda_aware) :
  Pointers(lmp)
{
  int nthreads = lmp->kokkos->nthreads;
@@ -70,7 +71,7 @@ FFT3dKokkos<DeviceType>::FFT3dKokkos(LAMMPS *lmp, MPI_Comm comm, int nfast, int
  plan = fft_3d_create_plan_kokkos(comm,nfast,nmid,nslow,
                            in_ilo,in_ihi,in_jlo,in_jhi,in_klo,in_khi,
                            out_ilo,out_ihi,out_jlo,out_jhi,out_klo,out_khi,
                            scaled,permute,nbuf,usecollective,nthreads);
                            scaled,permute,nbuf,usecollective,nthreads,usecuda_aware);
  if (plan == NULL) error->one(FLERR,"Could not create 3d FFT plan");
}

@@ -368,6 +369,7 @@ void FFT3dKokkos<DeviceType>::fft_3d_kokkos(typename FFT_AT::t_FFT_DATA_1d d_in,
                          2 = permute twice = slow->fast, fast->mid, mid->slow
   nbuf                 returns size of internal storage buffers used by FFT
   usecollective        use collective MPI operations for remapping data
   usecuda_aware        use CUDA-Aware MPI or not
------------------------------------------------------------------------- */

template<class DeviceType>
@@ -378,7 +380,7 @@ struct fft_plan_3d_kokkos<DeviceType>* FFT3dKokkos<DeviceType>::fft_3d_create_pl
       int out_ilo, int out_ihi, int out_jlo, int out_jhi,
       int out_klo, int out_khi,
       int scaled, int permute, int *nbuf, int usecollective,
       int nthreads)
       int nthreads, int usecuda_aware)
{
  struct fft_plan_3d_kokkos<DeviceType> *plan;
  int me,nprocs;
@@ -435,7 +437,8 @@ struct fft_plan_3d_kokkos<DeviceType>* FFT3dKokkos<DeviceType>::fft_3d_create_pl
    plan->pre_plan =
      remapKK->remap_3d_create_plan_kokkos(comm,in_ilo,in_ihi,in_jlo,in_jhi,in_klo,in_khi,
                           first_ilo,first_ihi,first_jlo,first_jhi,
                           first_klo,first_khi,2,0,0,FFT_PRECISION,0);
                           first_klo,first_khi,2,0,0,FFT_PRECISION,
                           usecollective,usecuda_aware);
    if (plan->pre_plan == NULL) return NULL;
  }

@@ -460,7 +463,7 @@ struct fft_plan_3d_kokkos<DeviceType>* FFT3dKokkos<DeviceType>::fft_3d_create_pl
                           first_klo,first_khi,
                           second_ilo,second_ihi,second_jlo,second_jhi,
                           second_klo,second_khi,2,1,0,FFT_PRECISION,
                           usecollective);
                           usecollective,usecuda_aware);
  if (plan->mid1_plan == NULL) return NULL;

  // 1d FFTs along mid axis
@@ -500,7 +503,8 @@ struct fft_plan_3d_kokkos<DeviceType>* FFT3dKokkos<DeviceType>::fft_3d_create_pl
                         second_jlo,second_jhi,second_klo,second_khi,
                         second_ilo,second_ihi,
                         third_jlo,third_jhi,third_klo,third_khi,
                         third_ilo,third_ihi,2,1,0,FFT_PRECISION,usecollective);
                         third_ilo,third_ihi,2,1,0,FFT_PRECISION,
                         usecollective,usecuda_aware);
  if (plan->mid2_plan == NULL) return NULL;

  // 1d FFTs along slow axis
@@ -527,7 +531,8 @@ struct fft_plan_3d_kokkos<DeviceType>* FFT3dKokkos<DeviceType>::fft_3d_create_pl
                           third_klo,third_khi,third_ilo,third_ihi,
                           third_jlo,third_jhi,
                           out_klo,out_khi,out_ilo,out_ihi,
                           out_jlo,out_jhi,2,(permute+1)%3,0,FFT_PRECISION,0);
                           out_jlo,out_jhi,2,(permute+1)%3,0,FFT_PRECISION,
                           usecollective,usecuda_aware);
    if (plan->post_plan == NULL) return NULL;
  }

+2 −2
Original line number Diff line number Diff line
@@ -77,7 +77,7 @@ class FFT3dKokkos : protected Pointers {

  FFT3dKokkos(class LAMMPS *, MPI_Comm,
        int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,
        int,int,int *,int);
        int,int,int *,int,int);
  ~FFT3dKokkos();
  void compute(typename FFT_AT::t_FFT_SCALAR_1d, typename FFT_AT::t_FFT_SCALAR_1d, int);
  void timing1d(typename FFT_AT::t_FFT_SCALAR_1d, int, int);
@@ -95,7 +95,7 @@ class FFT3dKokkos : protected Pointers {
  struct fft_plan_3d_kokkos<DeviceType> *fft_3d_create_plan_kokkos(MPI_Comm, int, int, int,
                                         int, int, int, int, int,
                                         int, int, int, int, int, int, int,
                                         int, int, int *, int, int);
                                         int, int, int *, int, int, int);

  void fft_3d_destroy_plan_kokkos(struct fft_plan_3d_kokkos<DeviceType> *);

+5 −3
Original line number Diff line number Diff line
@@ -840,21 +840,23 @@ void PPPMKokkos<DeviceType>::allocate()
  // 2nd FFT returns data in 3d brick decomposition
  // remap takes data from 3d brick to FFT decomposition

  int collective_flag = 0; // not yet supported in Kokkos version
  int cuda_aware_flag = lmp->kokkos->cuda_aware_flag;
  int tmp;

  fft1 = new FFT3dKokkos<DeviceType>(lmp,world,nx_pppm,ny_pppm,nz_pppm,
                         nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
                         nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
                         0,0,&tmp,collective_flag);
                         0,0,&tmp,collective_flag,cuda_aware_flag);

  fft2 = new FFT3dKokkos<DeviceType>(lmp,world,nx_pppm,ny_pppm,nz_pppm,
                         nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
                         nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
                         0,0,&tmp,collective_flag);
                         0,0,&tmp,collective_flag,cuda_aware_flag);
  remap = new RemapKokkos<DeviceType>(lmp,world,
                          nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
                          nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
                          1,0,0,FFT_PRECISION,collective_flag);
                          1,0,0,FFT_PRECISION,collective_flag,cuda_aware_flag);

  // create ghost grid object for rho and electric field communication

+29 −5
Original line number Diff line number Diff line
@@ -38,12 +38,14 @@ RemapKokkos<DeviceType>::RemapKokkos(LAMMPS *lmp, MPI_Comm comm,
             int out_ilo, int out_ihi, int out_jlo, int out_jhi,
             int out_klo, int out_khi,
             int nqty, int permute, int memory,
             int precision, int usecollective) : Pointers(lmp)
             int precision, int usecollective,
             int usecuda_aware) : Pointers(lmp)
{
  plan = remap_3d_create_plan_kokkos(comm,
                              in_ilo,in_ihi,in_jlo,in_jhi,in_klo,in_khi,
                              out_ilo,out_ihi,out_jlo,out_jhi,out_klo,out_khi,
                              nqty,permute,memory,precision,usecollective);
                              nqty,permute,memory,precision,usecollective,
                              usecuda_aware);
  if (plan == NULL) error->one(FLERR,"Could not create 3d remap plan");
}

@@ -119,20 +121,36 @@ void RemapKokkos<DeviceType>::remap_3d_kokkos(typename FFT_AT::t_FFT_SCALAR_1d d

  // post all recvs into scratch space

  FFT_SCALAR* v_scratch = d_scratch.data();
  if (!plan->usecuda_aware) {
    plan->h_scratch = Kokkos::create_mirror_view(d_scratch);
    v_scratch = plan->h_scratch.data();
  }

  for (irecv = 0; irecv < plan->nrecv; irecv++) {
    FFT_SCALAR* scratch = d_scratch.data() + plan->recv_bufloc[irecv];
    FFT_SCALAR* scratch = v_scratch + plan->recv_bufloc[irecv];
    MPI_Irecv(scratch,plan->recv_size[irecv],
              MPI_FFT_SCALAR,plan->recv_proc[irecv],0,
              plan->comm,&plan->request[irecv]);
  }

  FFT_SCALAR* v_sendbuf = plan->d_sendbuf.data();
  if (!plan->usecuda_aware) {
    plan->h_sendbuf = Kokkos::create_mirror_view(plan->d_sendbuf);
    v_sendbuf = plan->h_sendbuf.data();
  }

  // send all messages to other procs

  for (isend = 0; isend < plan->nsend; isend++) {
    int in_offset = plan->send_offset[isend];
    plan->pack(d_in,in_offset,
               plan->d_sendbuf,0,&plan->packplan[isend]);
    MPI_Send(plan->d_sendbuf.data(),plan->send_size[isend],MPI_FFT_SCALAR,

    if (!plan->usecuda_aware)
      Kokkos::deep_copy(plan->h_sendbuf,plan->d_sendbuf);

    MPI_Send(v_sendbuf,plan->send_size[isend],MPI_FFT_SCALAR,
             plan->send_proc[isend],0,plan->comm);
  }

@@ -161,6 +179,9 @@ void RemapKokkos<DeviceType>::remap_3d_kokkos(typename FFT_AT::t_FFT_SCALAR_1d d
    int scratch_offset = plan->recv_bufloc[irecv];
    int out_offset = plan->recv_offset[irecv];

    if (!plan->usecuda_aware)
      Kokkos::deep_copy(d_scratch,plan->h_scratch);

    plan->unpack(d_scratch,scratch_offset,
                 d_out,out_offset,&plan->unpackplan[irecv]);
  }
@@ -189,6 +210,7 @@ void RemapKokkos<DeviceType>::remap_3d_kokkos(typename FFT_AT::t_FFT_SCALAR_1d d
                          1 = single precision (4 bytes per datum)
                          2 = double precision (8 bytes per datum)
   usecollective        whether to use collective MPI or point-to-point
   usecuda_aware        whether to use CUDA-Aware MPI or not
------------------------------------------------------------------------- */

template<class DeviceType>
@@ -198,7 +220,8 @@ struct remap_plan_3d_kokkos<DeviceType>* RemapKokkos<DeviceType>::remap_3d_creat
  int in_klo, int in_khi,
  int out_ilo, int out_ihi, int out_jlo, int out_jhi,
  int out_klo, int out_khi,
  int nqty, int permute, int memory, int precision, int usecollective)
  int nqty, int permute, int memory, int precision,
  int usecollective, int usecuda_aware)
{

  struct remap_plan_3d_kokkos<DeviceType> *plan;
@@ -216,6 +239,7 @@ struct remap_plan_3d_kokkos<DeviceType>* RemapKokkos<DeviceType>::remap_3d_creat
  plan = new struct remap_plan_3d_kokkos<DeviceType>;
  if (plan == NULL) return NULL;
  plan->usecollective = usecollective;
  plan->usecuda_aware = usecuda_aware;

  // store parameters in local data structs

+5 −2
Original line number Diff line number Diff line
@@ -28,7 +28,9 @@ struct remap_plan_3d_kokkos {
  typedef DeviceType device_type;
  typedef FFTArrayTypes<DeviceType> FFT_AT;
  typename FFT_AT::t_FFT_SCALAR_1d d_sendbuf;                  // buffer for MPI sends
  FFT_HAT::t_FFT_SCALAR_1d h_sendbuf;                          // host buffer for MPI sends
  typename FFT_AT::t_FFT_SCALAR_1d d_scratch;                  // scratch buffer for MPI recvs
  FFT_HAT::t_FFT_SCALAR_1d h_scratch;                          // host scratch buffer for MPI recvs
  void (*pack)(typename FFT_AT::t_FFT_SCALAR_1d_um, int, typename FFT_AT::t_FFT_SCALAR_1d_um, int, struct pack_plan_3d *);
                                    // which pack function to use
  void (*unpack)(typename FFT_AT::t_FFT_SCALAR_1d_um, int, typename FFT_AT::t_FFT_SCALAR_1d_um, int, struct pack_plan_3d *);
@@ -51,6 +53,7 @@ struct remap_plan_3d_kokkos {
  int usecollective;                // use collective or point-to-point MPI
  int commringlen;                  // length of commringlist
  int *commringlist;                // ranks on communication ring of this plan
  int usecuda_aware;                // use CUDA-Aware MPI or not
};

template<class DeviceType>
@@ -60,7 +63,7 @@ class RemapKokkos : protected Pointers {
  typedef FFTArrayTypes<DeviceType> FFT_AT;
  RemapKokkos(class LAMMPS *);
  RemapKokkos(class LAMMPS *, MPI_Comm,int,int,int,int,int,int,
        int,int,int,int,int,int,int,int,int,int,int);
        int,int,int,int,int,int,int,int,int,int,int,int);
  ~RemapKokkos();
  void perform(typename FFT_AT::t_FFT_SCALAR_1d, typename FFT_AT::t_FFT_SCALAR_1d, typename FFT_AT::t_FFT_SCALAR_1d);

@@ -70,7 +73,7 @@ class RemapKokkos : protected Pointers {
  struct remap_plan_3d_kokkos<DeviceType> *remap_3d_create_plan_kokkos(MPI_Comm,
                                             int, int, int, int, int, int,
                                             int, int, int, int, int, int,
                                             int, int, int, int, int);
                                             int, int, int, int, int, int);
  void remap_3d_destroy_plan_kokkos(struct remap_plan_3d_kokkos<DeviceType> *);
};