Commit f8226508 authored by Stan Moore's avatar Stan Moore
Browse files

Allow Kokkos remap to turn off Cuda-aware MPI

parent 5c2f0ecc
Loading
Loading
Loading
Loading
+12 −7
Original line number Original line Diff line number Diff line
@@ -38,7 +38,8 @@ FFT3dKokkos<DeviceType>::FFT3dKokkos(LAMMPS *lmp, MPI_Comm comm, int nfast, int
             int in_klo, int in_khi,
             int in_klo, int in_khi,
             int out_ilo, int out_ihi, int out_jlo, int out_jhi,
             int out_ilo, int out_ihi, int out_jlo, int out_jhi,
             int out_klo, int out_khi,
             int out_klo, int out_khi,
             int scaled, int permute, int *nbuf, int usecollective) :
             int scaled, int permute, int *nbuf, int usecollective,
             int usecuda_aware) :
  Pointers(lmp)
  Pointers(lmp)
{
{
  int nthreads = lmp->kokkos->nthreads;
  int nthreads = lmp->kokkos->nthreads;
@@ -70,7 +71,7 @@ FFT3dKokkos<DeviceType>::FFT3dKokkos(LAMMPS *lmp, MPI_Comm comm, int nfast, int
  plan = fft_3d_create_plan_kokkos(comm,nfast,nmid,nslow,
  plan = fft_3d_create_plan_kokkos(comm,nfast,nmid,nslow,
                            in_ilo,in_ihi,in_jlo,in_jhi,in_klo,in_khi,
                            in_ilo,in_ihi,in_jlo,in_jhi,in_klo,in_khi,
                            out_ilo,out_ihi,out_jlo,out_jhi,out_klo,out_khi,
                            out_ilo,out_ihi,out_jlo,out_jhi,out_klo,out_khi,
                            scaled,permute,nbuf,usecollective,nthreads);
                            scaled,permute,nbuf,usecollective,nthreads,usecuda_aware);
  if (plan == NULL) error->one(FLERR,"Could not create 3d FFT plan");
  if (plan == NULL) error->one(FLERR,"Could not create 3d FFT plan");
}
}


@@ -368,6 +369,7 @@ void FFT3dKokkos<DeviceType>::fft_3d_kokkos(typename FFT_AT::t_FFT_DATA_1d d_in,
                          2 = permute twice = slow->fast, fast->mid, mid->slow
                          2 = permute twice = slow->fast, fast->mid, mid->slow
   nbuf                 returns size of internal storage buffers used by FFT
   nbuf                 returns size of internal storage buffers used by FFT
   usecollective        use collective MPI operations for remapping data
   usecollective        use collective MPI operations for remapping data
   usecuda_aware        use CUDA-Aware MPI or not
------------------------------------------------------------------------- */
------------------------------------------------------------------------- */


template<class DeviceType>
template<class DeviceType>
@@ -378,7 +380,7 @@ struct fft_plan_3d_kokkos<DeviceType>* FFT3dKokkos<DeviceType>::fft_3d_create_pl
       int out_ilo, int out_ihi, int out_jlo, int out_jhi,
       int out_ilo, int out_ihi, int out_jlo, int out_jhi,
       int out_klo, int out_khi,
       int out_klo, int out_khi,
       int scaled, int permute, int *nbuf, int usecollective,
       int scaled, int permute, int *nbuf, int usecollective,
       int nthreads)
       int nthreads, int usecuda_aware)
{
{
  struct fft_plan_3d_kokkos<DeviceType> *plan;
  struct fft_plan_3d_kokkos<DeviceType> *plan;
  int me,nprocs;
  int me,nprocs;
@@ -435,7 +437,8 @@ struct fft_plan_3d_kokkos<DeviceType>* FFT3dKokkos<DeviceType>::fft_3d_create_pl
    plan->pre_plan =
    plan->pre_plan =
      remapKK->remap_3d_create_plan_kokkos(comm,in_ilo,in_ihi,in_jlo,in_jhi,in_klo,in_khi,
      remapKK->remap_3d_create_plan_kokkos(comm,in_ilo,in_ihi,in_jlo,in_jhi,in_klo,in_khi,
                           first_ilo,first_ihi,first_jlo,first_jhi,
                           first_ilo,first_ihi,first_jlo,first_jhi,
                           first_klo,first_khi,2,0,0,FFT_PRECISION,0);
                           first_klo,first_khi,2,0,0,FFT_PRECISION,
                           usecollective,usecuda_aware);
    if (plan->pre_plan == NULL) return NULL;
    if (plan->pre_plan == NULL) return NULL;
  }
  }


@@ -460,7 +463,7 @@ struct fft_plan_3d_kokkos<DeviceType>* FFT3dKokkos<DeviceType>::fft_3d_create_pl
                           first_klo,first_khi,
                           first_klo,first_khi,
                           second_ilo,second_ihi,second_jlo,second_jhi,
                           second_ilo,second_ihi,second_jlo,second_jhi,
                           second_klo,second_khi,2,1,0,FFT_PRECISION,
                           second_klo,second_khi,2,1,0,FFT_PRECISION,
                           usecollective);
                           usecollective,usecuda_aware);
  if (plan->mid1_plan == NULL) return NULL;
  if (plan->mid1_plan == NULL) return NULL;


  // 1d FFTs along mid axis
  // 1d FFTs along mid axis
@@ -500,7 +503,8 @@ struct fft_plan_3d_kokkos<DeviceType>* FFT3dKokkos<DeviceType>::fft_3d_create_pl
                         second_jlo,second_jhi,second_klo,second_khi,
                         second_jlo,second_jhi,second_klo,second_khi,
                         second_ilo,second_ihi,
                         second_ilo,second_ihi,
                         third_jlo,third_jhi,third_klo,third_khi,
                         third_jlo,third_jhi,third_klo,third_khi,
                         third_ilo,third_ihi,2,1,0,FFT_PRECISION,usecollective);
                         third_ilo,third_ihi,2,1,0,FFT_PRECISION,
                         usecollective,usecuda_aware);
  if (plan->mid2_plan == NULL) return NULL;
  if (plan->mid2_plan == NULL) return NULL;


  // 1d FFTs along slow axis
  // 1d FFTs along slow axis
@@ -527,7 +531,8 @@ struct fft_plan_3d_kokkos<DeviceType>* FFT3dKokkos<DeviceType>::fft_3d_create_pl
                           third_klo,third_khi,third_ilo,third_ihi,
                           third_klo,third_khi,third_ilo,third_ihi,
                           third_jlo,third_jhi,
                           third_jlo,third_jhi,
                           out_klo,out_khi,out_ilo,out_ihi,
                           out_klo,out_khi,out_ilo,out_ihi,
                           out_jlo,out_jhi,2,(permute+1)%3,0,FFT_PRECISION,0);
                           out_jlo,out_jhi,2,(permute+1)%3,0,FFT_PRECISION,
                           usecollective,usecuda_aware);
    if (plan->post_plan == NULL) return NULL;
    if (plan->post_plan == NULL) return NULL;
  }
  }


+2 −2
Original line number Original line Diff line number Diff line
@@ -77,7 +77,7 @@ class FFT3dKokkos : protected Pointers {


  FFT3dKokkos(class LAMMPS *, MPI_Comm,
  FFT3dKokkos(class LAMMPS *, MPI_Comm,
        int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,
        int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,
        int,int,int *,int);
        int,int,int *,int,int);
  ~FFT3dKokkos();
  ~FFT3dKokkos();
  void compute(typename FFT_AT::t_FFT_SCALAR_1d, typename FFT_AT::t_FFT_SCALAR_1d, int);
  void compute(typename FFT_AT::t_FFT_SCALAR_1d, typename FFT_AT::t_FFT_SCALAR_1d, int);
  void timing1d(typename FFT_AT::t_FFT_SCALAR_1d, int, int);
  void timing1d(typename FFT_AT::t_FFT_SCALAR_1d, int, int);
@@ -95,7 +95,7 @@ class FFT3dKokkos : protected Pointers {
  struct fft_plan_3d_kokkos<DeviceType> *fft_3d_create_plan_kokkos(MPI_Comm, int, int, int,
  struct fft_plan_3d_kokkos<DeviceType> *fft_3d_create_plan_kokkos(MPI_Comm, int, int, int,
                                         int, int, int, int, int,
                                         int, int, int, int, int,
                                         int, int, int, int, int, int, int,
                                         int, int, int, int, int, int, int,
                                         int, int, int *, int, int);
                                         int, int, int *, int, int, int);


  void fft_3d_destroy_plan_kokkos(struct fft_plan_3d_kokkos<DeviceType> *);
  void fft_3d_destroy_plan_kokkos(struct fft_plan_3d_kokkos<DeviceType> *);


+5 −3
Original line number Original line Diff line number Diff line
@@ -840,21 +840,23 @@ void PPPMKokkos<DeviceType>::allocate()
  // 2nd FFT returns data in 3d brick decomposition
  // 2nd FFT returns data in 3d brick decomposition
  // remap takes data from 3d brick to FFT decomposition
  // remap takes data from 3d brick to FFT decomposition


  int collective_flag = 0; // not yet supported in Kokkos version
  int cuda_aware_flag = lmp->kokkos->cuda_aware_flag;
  int tmp;
  int tmp;


  fft1 = new FFT3dKokkos<DeviceType>(lmp,world,nx_pppm,ny_pppm,nz_pppm,
  fft1 = new FFT3dKokkos<DeviceType>(lmp,world,nx_pppm,ny_pppm,nz_pppm,
                         nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
                         nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
                         nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
                         nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
                         0,0,&tmp,collective_flag);
                         0,0,&tmp,collective_flag,cuda_aware_flag);


  fft2 = new FFT3dKokkos<DeviceType>(lmp,world,nx_pppm,ny_pppm,nz_pppm,
  fft2 = new FFT3dKokkos<DeviceType>(lmp,world,nx_pppm,ny_pppm,nz_pppm,
                         nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
                         nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
                         nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
                         nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
                         0,0,&tmp,collective_flag);
                         0,0,&tmp,collective_flag,cuda_aware_flag);
  remap = new RemapKokkos<DeviceType>(lmp,world,
  remap = new RemapKokkos<DeviceType>(lmp,world,
                          nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
                          nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
                          nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
                          nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
                          1,0,0,FFT_PRECISION,collective_flag);
                          1,0,0,FFT_PRECISION,collective_flag,cuda_aware_flag);


  // create ghost grid object for rho and electric field communication
  // create ghost grid object for rho and electric field communication


+29 −5
Original line number Original line Diff line number Diff line
@@ -38,12 +38,14 @@ RemapKokkos<DeviceType>::RemapKokkos(LAMMPS *lmp, MPI_Comm comm,
             int out_ilo, int out_ihi, int out_jlo, int out_jhi,
             int out_ilo, int out_ihi, int out_jlo, int out_jhi,
             int out_klo, int out_khi,
             int out_klo, int out_khi,
             int nqty, int permute, int memory,
             int nqty, int permute, int memory,
             int precision, int usecollective) : Pointers(lmp)
             int precision, int usecollective,
             int usecuda_aware) : Pointers(lmp)
{
{
  plan = remap_3d_create_plan_kokkos(comm,
  plan = remap_3d_create_plan_kokkos(comm,
                              in_ilo,in_ihi,in_jlo,in_jhi,in_klo,in_khi,
                              in_ilo,in_ihi,in_jlo,in_jhi,in_klo,in_khi,
                              out_ilo,out_ihi,out_jlo,out_jhi,out_klo,out_khi,
                              out_ilo,out_ihi,out_jlo,out_jhi,out_klo,out_khi,
                              nqty,permute,memory,precision,usecollective);
                              nqty,permute,memory,precision,usecollective,
                              usecuda_aware);
  if (plan == NULL) error->one(FLERR,"Could not create 3d remap plan");
  if (plan == NULL) error->one(FLERR,"Could not create 3d remap plan");
}
}


@@ -119,20 +121,36 @@ void RemapKokkos<DeviceType>::remap_3d_kokkos(typename FFT_AT::t_FFT_SCALAR_1d d


  // post all recvs into scratch space
  // post all recvs into scratch space


  FFT_SCALAR* v_scratch = d_scratch.data();
  if (!plan->usecuda_aware) {
    plan->h_scratch = Kokkos::create_mirror_view(d_scratch);
    v_scratch = plan->h_scratch.data();
  }

  for (irecv = 0; irecv < plan->nrecv; irecv++) {
  for (irecv = 0; irecv < plan->nrecv; irecv++) {
    FFT_SCALAR* scratch = d_scratch.data() + plan->recv_bufloc[irecv];
    FFT_SCALAR* scratch = v_scratch + plan->recv_bufloc[irecv];
    MPI_Irecv(scratch,plan->recv_size[irecv],
    MPI_Irecv(scratch,plan->recv_size[irecv],
              MPI_FFT_SCALAR,plan->recv_proc[irecv],0,
              MPI_FFT_SCALAR,plan->recv_proc[irecv],0,
              plan->comm,&plan->request[irecv]);
              plan->comm,&plan->request[irecv]);
  }
  }


  FFT_SCALAR* v_sendbuf = plan->d_sendbuf.data();
  if (!plan->usecuda_aware) {
    plan->h_sendbuf = Kokkos::create_mirror_view(plan->d_sendbuf);
    v_sendbuf = plan->h_sendbuf.data();
  }

  // send all messages to other procs
  // send all messages to other procs


  for (isend = 0; isend < plan->nsend; isend++) {
  for (isend = 0; isend < plan->nsend; isend++) {
    int in_offset = plan->send_offset[isend];
    int in_offset = plan->send_offset[isend];
    plan->pack(d_in,in_offset,
    plan->pack(d_in,in_offset,
               plan->d_sendbuf,0,&plan->packplan[isend]);
               plan->d_sendbuf,0,&plan->packplan[isend]);
    MPI_Send(plan->d_sendbuf.data(),plan->send_size[isend],MPI_FFT_SCALAR,

    if (!plan->usecuda_aware)
      Kokkos::deep_copy(plan->h_sendbuf,plan->d_sendbuf);

    MPI_Send(v_sendbuf,plan->send_size[isend],MPI_FFT_SCALAR,
             plan->send_proc[isend],0,plan->comm);
             plan->send_proc[isend],0,plan->comm);
  }
  }


@@ -161,6 +179,9 @@ void RemapKokkos<DeviceType>::remap_3d_kokkos(typename FFT_AT::t_FFT_SCALAR_1d d
    int scratch_offset = plan->recv_bufloc[irecv];
    int scratch_offset = plan->recv_bufloc[irecv];
    int out_offset = plan->recv_offset[irecv];
    int out_offset = plan->recv_offset[irecv];


    if (!plan->usecuda_aware)
      Kokkos::deep_copy(d_scratch,plan->h_scratch);

    plan->unpack(d_scratch,scratch_offset,
    plan->unpack(d_scratch,scratch_offset,
                 d_out,out_offset,&plan->unpackplan[irecv]);
                 d_out,out_offset,&plan->unpackplan[irecv]);
  }
  }
@@ -189,6 +210,7 @@ void RemapKokkos<DeviceType>::remap_3d_kokkos(typename FFT_AT::t_FFT_SCALAR_1d d
                          1 = single precision (4 bytes per datum)
                          1 = single precision (4 bytes per datum)
                          2 = double precision (8 bytes per datum)
                          2 = double precision (8 bytes per datum)
   usecollective        whether to use collective MPI or point-to-point
   usecollective        whether to use collective MPI or point-to-point
   usecuda_aware        whether to use CUDA-Aware MPI or not
------------------------------------------------------------------------- */
------------------------------------------------------------------------- */


template<class DeviceType>
template<class DeviceType>
@@ -198,7 +220,8 @@ struct remap_plan_3d_kokkos<DeviceType>* RemapKokkos<DeviceType>::remap_3d_creat
  int in_klo, int in_khi,
  int in_klo, int in_khi,
  int out_ilo, int out_ihi, int out_jlo, int out_jhi,
  int out_ilo, int out_ihi, int out_jlo, int out_jhi,
  int out_klo, int out_khi,
  int out_klo, int out_khi,
  int nqty, int permute, int memory, int precision, int usecollective)
  int nqty, int permute, int memory, int precision,
  int usecollective, int usecuda_aware)
{
{


  struct remap_plan_3d_kokkos<DeviceType> *plan;
  struct remap_plan_3d_kokkos<DeviceType> *plan;
@@ -216,6 +239,7 @@ struct remap_plan_3d_kokkos<DeviceType>* RemapKokkos<DeviceType>::remap_3d_creat
  plan = new struct remap_plan_3d_kokkos<DeviceType>;
  plan = new struct remap_plan_3d_kokkos<DeviceType>;
  if (plan == NULL) return NULL;
  if (plan == NULL) return NULL;
  plan->usecollective = usecollective;
  plan->usecollective = usecollective;
  plan->usecuda_aware = usecuda_aware;


  // store parameters in local data structs
  // store parameters in local data structs


+5 −2
Original line number Original line Diff line number Diff line
@@ -28,7 +28,9 @@ struct remap_plan_3d_kokkos {
  typedef DeviceType device_type;
  typedef DeviceType device_type;
  typedef FFTArrayTypes<DeviceType> FFT_AT;
  typedef FFTArrayTypes<DeviceType> FFT_AT;
  typename FFT_AT::t_FFT_SCALAR_1d d_sendbuf;                  // buffer for MPI sends
  typename FFT_AT::t_FFT_SCALAR_1d d_sendbuf;                  // buffer for MPI sends
  FFT_HAT::t_FFT_SCALAR_1d h_sendbuf;                          // host buffer for MPI sends
  typename FFT_AT::t_FFT_SCALAR_1d d_scratch;                  // scratch buffer for MPI recvs
  typename FFT_AT::t_FFT_SCALAR_1d d_scratch;                  // scratch buffer for MPI recvs
  FFT_HAT::t_FFT_SCALAR_1d h_scratch;                          // host scratch buffer for MPI recvs
  void (*pack)(typename FFT_AT::t_FFT_SCALAR_1d_um, int, typename FFT_AT::t_FFT_SCALAR_1d_um, int, struct pack_plan_3d *);
  void (*pack)(typename FFT_AT::t_FFT_SCALAR_1d_um, int, typename FFT_AT::t_FFT_SCALAR_1d_um, int, struct pack_plan_3d *);
                                    // which pack function to use
                                    // which pack function to use
  void (*unpack)(typename FFT_AT::t_FFT_SCALAR_1d_um, int, typename FFT_AT::t_FFT_SCALAR_1d_um, int, struct pack_plan_3d *);
  void (*unpack)(typename FFT_AT::t_FFT_SCALAR_1d_um, int, typename FFT_AT::t_FFT_SCALAR_1d_um, int, struct pack_plan_3d *);
@@ -51,6 +53,7 @@ struct remap_plan_3d_kokkos {
  int usecollective;                // use collective or point-to-point MPI
  int usecollective;                // use collective or point-to-point MPI
  int commringlen;                  // length of commringlist
  int commringlen;                  // length of commringlist
  int *commringlist;                // ranks on communication ring of this plan
  int *commringlist;                // ranks on communication ring of this plan
  int usecuda_aware;                // use CUDA-Aware MPI or not
};
};


template<class DeviceType>
template<class DeviceType>
@@ -60,7 +63,7 @@ class RemapKokkos : protected Pointers {
  typedef FFTArrayTypes<DeviceType> FFT_AT;
  typedef FFTArrayTypes<DeviceType> FFT_AT;
  RemapKokkos(class LAMMPS *);
  RemapKokkos(class LAMMPS *);
  RemapKokkos(class LAMMPS *, MPI_Comm,int,int,int,int,int,int,
  RemapKokkos(class LAMMPS *, MPI_Comm,int,int,int,int,int,int,
        int,int,int,int,int,int,int,int,int,int,int);
        int,int,int,int,int,int,int,int,int,int,int,int);
  ~RemapKokkos();
  ~RemapKokkos();
  void perform(typename FFT_AT::t_FFT_SCALAR_1d, typename FFT_AT::t_FFT_SCALAR_1d, typename FFT_AT::t_FFT_SCALAR_1d);
  void perform(typename FFT_AT::t_FFT_SCALAR_1d, typename FFT_AT::t_FFT_SCALAR_1d, typename FFT_AT::t_FFT_SCALAR_1d);


@@ -70,7 +73,7 @@ class RemapKokkos : protected Pointers {
  struct remap_plan_3d_kokkos<DeviceType> *remap_3d_create_plan_kokkos(MPI_Comm,
  struct remap_plan_3d_kokkos<DeviceType> *remap_3d_create_plan_kokkos(MPI_Comm,
                                             int, int, int, int, int, int,
                                             int, int, int, int, int, int,
                                             int, int, int, int, int, int,
                                             int, int, int, int, int, int,
                                             int, int, int, int, int);
                                             int, int, int, int, int, int);
  void remap_3d_destroy_plan_kokkos(struct remap_plan_3d_kokkos<DeviceType> *);
  void remap_3d_destroy_plan_kokkos(struct remap_plan_3d_kokkos<DeviceType> *);
};
};