Commit 7e58f084 authored by Michael Brown's avatar Michael Brown
Browse files

Fixing bug in pppm/intel for AVX-512 with single precision and ik diff.

This commit simplifies the code by eliminating packing optimizations that were slower
under typical parameters and had some memory bugs.
parent f2c11727
Loading
Loading
Loading
Loading
+16 −264
Original line number Diff line number Diff line
@@ -66,11 +66,7 @@ PPPMIntel::PPPMIntel(LAMMPS *lmp, int narg, char **arg) : PPPM(lmp, narg, arg)
  rho_lookup = drho_lookup = NULL;
  rho_points = 0;

  vdxy_brick = vdz0_brick = NULL;
  work3 = NULL;
  cg_pack = NULL;

  _use_table = _use_packing = _use_lrt = 0;
  _use_table = _use_lrt = 0;
}

PPPMIntel::~PPPMIntel()
@@ -82,12 +78,6 @@ PPPMIntel::~PPPMIntel()

  memory->destroy(rho_lookup);
  memory->destroy(drho_lookup);

  memory->destroy3d_offset(vdxy_brick, nzlo_out, nylo_out, 2*nxlo_out);
  memory->destroy3d_offset(vdz0_brick, nzlo_out, nylo_out, 2*nxlo_out);
  memory->destroy(work3);

  delete cg_pack;
}

/* ----------------------------------------------------------------------
@@ -141,36 +131,6 @@ void PPPMIntel::init()
  if (order > INTEL_P3M_MAXORDER)
    error->all(FLERR,"PPPM order greater than supported by USER-INTEL\n");

  _use_packing = (order == 7) && (INTEL_VECTOR_WIDTH == 16)
                              && (sizeof(FFT_SCALAR) == sizeof(float))
                              && (differentiation_flag == 0);
  if (_use_packing) {
    memory->destroy3d_offset(vdx_brick,nzlo_out,nylo_out,nxlo_out);
    memory->destroy3d_offset(vdy_brick,nzlo_out,nylo_out,nxlo_out);
    memory->destroy3d_offset(vdz_brick,nzlo_out,nylo_out,nxlo_out);
    memory->destroy3d_offset(vdxy_brick, nzlo_out, nylo_out, 2*nxlo_out);
    create3d_offset(vdxy_brick, nzlo_out, nzhi_out+2,
		    nylo_out, nyhi_out, 2*nxlo_out, 2*nxhi_out+1,
		    "pppmintel:vdxy_brick");
    memory->destroy3d_offset(vdz0_brick, nzlo_out, nylo_out, 2*nxlo_out);
    create3d_offset(vdz0_brick, nzlo_out, nzhi_out+2,
		    nylo_out, nyhi_out, 2*nxlo_out, 2*nxhi_out+1,
		    "pppmintel:vdz0_brick");
    memory->destroy(work3);
    memory->create(work3, 2*nfft_both, "pppmintel:work3");

    // new communicator for the double-size bricks
    delete cg_pack;
    int (*procneigh)[2] = comm->procneigh;
    cg_pack = new GridComm(lmp,world,2,0, 2*nxlo_in,2*nxhi_in+1,nylo_in,
                           nyhi_in,nzlo_in,nzhi_in, 2*nxlo_out,2*nxhi_out+1,
                           nylo_out,nyhi_out,nzlo_out,nzhi_out,
                           procneigh[0][0],procneigh[0][1],procneigh[1][0],
                           procneigh[1][1],procneigh[2][0],procneigh[2][1]);

    cg_pack->ghost_notify();
    cg_pack->setup();
  }
}

/* ----------------------------------------------------------------------
@@ -272,18 +232,13 @@ void PPPMIntel::compute_first(int eflag, int vflag)
  // also performs per-atom calculations via poisson_peratom()

  if (differentiation_flag == 1) poisson_ad();
  else poisson_ik_intel();
  else poisson_ik();

  // all procs communicate E-field values
  // to fill ghost cells surrounding their 3d bricks

  if (differentiation_flag == 1) cg->forward_comm(this,FORWARD_AD);
  else {
    if (_use_packing)
      cg_pack->forward_comm(this,FORWARD_IK);
    else
      cg->forward_comm(this,FORWARD_IK);
  }
  else cg->forward_comm(this,FORWARD_IK);

  // extra per-atom energy/virial communication

@@ -604,7 +559,7 @@ void PPPMIntel::make_rho(IntelBuffers<flt_t,acc_t> *buffers)
   interpolate from grid to get electric field & force on my particles for ik
------------------------------------------------------------------------- */

template<class flt_t, class acc_t, int use_table, int use_packing>
template<class flt_t, class acc_t, int use_table>
void PPPMIntel::fieldforce_ik(IntelBuffers<flt_t,acc_t> *buffers)
{
  // loop over my charges, interpolate electric field from nearby grid points
@@ -649,9 +604,9 @@ void PPPMIntel::fieldforce_ik(IntelBuffers<flt_t,acc_t> *buffers)
      int ny = part2grid[i][1];
      int nz = part2grid[i][2];

      int nxsum = (use_packing ? 2 : 1) * (nx + nlower);
      int nxsum = nx + nlower;
      int nysum = ny + nlower;
      int nzsum = nz + nlower;;
      int nzsum = nz + nlower;

      FFT_SCALAR dx = nx+fshiftone - (x[i].x-lo0)*xi;
      FFT_SCALAR dy = ny+fshiftone - (x[i].y-lo1)*yi;
@@ -668,12 +623,7 @@ void PPPMIntel::fieldforce_ik(IntelBuffers<flt_t,acc_t> *buffers)
        #pragma simd
        #endif
        for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
          if (use_packing) {
            rho0[2 * k] = rho_lookup[idx][k];
            rho0[2 * k + 1] = rho_lookup[idx][k];
          } else {
	  rho0[k] = rho_lookup[idx][k];
          }
          rho1[k] = rho_lookup[idy][k];
          rho2[k] = rho_lookup[idz][k];
        }
@@ -690,12 +640,7 @@ void PPPMIntel::fieldforce_ik(IntelBuffers<flt_t,acc_t> *buffers)
            r2 = rho_coeff[l][k] + r2*dy;
            r3 = rho_coeff[l][k] + r3*dz;
          }
          if (use_packing) {
            rho0[2 * (k-nlower)] = r1;
            rho0[2 * (k-nlower) + 1] = r1;
          } else {
	  rho0[k-nlower] = r1;
          }
          rho1[k-nlower] = r2;
          rho2[k-nlower] = r3;
        }
@@ -722,38 +667,24 @@ void PPPMIntel::fieldforce_ik(IntelBuffers<flt_t,acc_t> *buffers)
          #if defined(LMP_SIMD_COMPILER)
          #pragma simd
          #endif
          for (int l = 0; l < (use_packing ? 2 : 1) *
                 INTEL_P3M_ALIGNED_MAXORDER; l++) {
          for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
            int mx = l+nxsum;
            FFT_SCALAR x0 = y0*rho0[l];
            if (use_packing) {
              ekxy_arr[l] -= x0*vdxy_brick[mz][my][mx];
              ekz0_arr[l] -= x0*vdz0_brick[mz][my][mx];
            } else {
	    ekx_arr[l] -= x0*vdx_brick[mz][my][mx];
	    eky_arr[l] -= x0*vdy_brick[mz][my][mx];
	    ekz_arr[l] -= x0*vdz_brick[mz][my][mx];
          }
        }
      }
      }

      FFT_SCALAR ekx, eky, ekz;
      ekx = eky = ekz = ZEROF;

      if (use_packing) {
        for (int l = 0; l < 2*order; l += 2) {
          ekx += ekxy_arr[l];
          eky += ekxy_arr[l+1];
          ekz += ekz0_arr[l];
        }
      } else {
      for (int l = 0; l < order; l++) {
	ekx += ekx_arr[l];
	eky += eky_arr[l];
	ekz += ekz_arr[l];
      }
      }

      // convert E-field to force

@@ -965,137 +896,6 @@ void PPPMIntel::fieldforce_ad(IntelBuffers<flt_t,acc_t> *buffers)
  }
}

/* ----------------------------------------------------------------------
   FFT-based Poisson solver for ik
   Does special things for packing mode to avoid repeated copies
------------------------------------------------------------------------- */

void PPPMIntel::poisson_ik_intel()
{
  if (_use_packing == 0) {
    poisson_ik();
    return;
  }

  int i,j,k,n;
  double eng;

  // transform charge density (r -> k)

  n = 0;
  for (i = 0; i < nfft; i++) {
    work1[n++] = density_fft[i];
    work1[n++] = ZEROF;
  }

  fft1->compute(work1,work1,1);

  // global energy and virial contribution

  double scaleinv = 1.0/(nx_pppm*ny_pppm*nz_pppm);
  double s2 = scaleinv*scaleinv;

  if (eflag_global || vflag_global) {
    if (vflag_global) {
      n = 0;
      for (i = 0; i < nfft; i++) {
        eng = s2 * greensfn[i] * (work1[n]*work1[n] +
                                  work1[n+1]*work1[n+1]);
        for (j = 0; j < 6; j++) virial[j] += eng*vg[i][j];
        if (eflag_global) energy += eng;
        n += 2;
      }
    } else {
      n = 0;
      for (i = 0; i < nfft; i++) {
        energy +=
          s2 * greensfn[i] * (work1[n]*work1[n] + work1[n+1]*work1[n+1]);
        n += 2;
      }
    }
  }

  // scale by 1/total-grid-pts to get rho(k)
  // multiply by Green's function to get V(k)

  n = 0;
  for (i = 0; i < nfft; i++) {
    work1[n++] *= scaleinv * greensfn[i];
    work1[n++] *= scaleinv * greensfn[i];
  }

  // extra FFTs for per-atom energy/virial

  if (evflag_atom) poisson_peratom();

  // triclinic system

  if (triclinic) {
    poisson_ik_triclinic();
    return;
  }

  // compute gradients of V(r) in each of 3 dims by transformimg -ik*V(k)
  // FFT leaves data in 3d brick decomposition
  // copy it into inner portion of vdx,vdy,vdz arrays

  // x direction gradient
  n = 0;
  for (k = nzlo_fft; k <= nzhi_fft; k++)
    for (j = nylo_fft; j <= nyhi_fft; j++)
      for (i = nxlo_fft; i <= nxhi_fft; i++) {
        work2[n] = fkx[i]*work1[n+1];
        work2[n+1] = -fkx[i]*work1[n];
        n += 2;
      }

  fft2->compute(work2,work2,-1);

  // y direction gradient

  n = 0;
  for (k = nzlo_fft; k <= nzhi_fft; k++)
    for (j = nylo_fft; j <= nyhi_fft; j++)
      for (i = nxlo_fft; i <= nxhi_fft; i++) {
        work3[n] = fky[j]*work1[n+1];
        work3[n+1] = -fky[j]*work1[n];
        n += 2;
      }

  fft2->compute(work3,work3,-1);

  n = 0;
  for (k = nzlo_in; k <= nzhi_in; k++)
    for (j = nylo_in; j <= nyhi_in; j++)
      for (i = nxlo_in; i <= nxhi_in; i++) {
        vdxy_brick[k][j][2*i] = work2[n];
        vdxy_brick[k][j][2*i+1] = work3[n];
        n += 2;
      }

  // z direction gradient

  n = 0;
  for (k = nzlo_fft; k <= nzhi_fft; k++)
    for (j = nylo_fft; j <= nyhi_fft; j++)
      for (i = nxlo_fft; i <= nxhi_fft; i++) {
        work2[n] = fkz[k]*work1[n+1];
        work2[n+1] = -fkz[k]*work1[n];
        n += 2;
      }

  fft2->compute(work2,work2,-1);

  n = 0;
  for (k = nzlo_in; k <= nzhi_in; k++)
    for (j = nylo_in; j <= nyhi_in; j++)
      for (i = nxlo_in; i <= nxhi_in; i++) {
        vdz0_brick[k][j][2*i] = work2[n];
        vdz0_brick[k][j][2*i+1] = 0.;
        n += 2;
      }
}

/* ----------------------------------------------------------------------
   precompute rho coefficients as a lookup table to save time in make_rho
   and fieldforce.  Instead of doing this polynomial for every atom 6 times
@@ -1141,46 +941,6 @@ void PPPMIntel::precompute_rho()
  }
}

/* ----------------------------------------------------------------------
   pack own values to buf to send to another proc
------------------------------------------------------------------------- */

void PPPMIntel::pack_forward(int flag, FFT_SCALAR *buf, int nlist, int *list)
{
  int n = 0;

  if ((flag == FORWARD_IK) && _use_packing) {
    FFT_SCALAR *xsrc = &vdxy_brick[nzlo_out][nylo_out][2*nxlo_out];
    FFT_SCALAR *zsrc = &vdz0_brick[nzlo_out][nylo_out][2*nxlo_out];
    for (int i = 0; i < nlist; i++) {
      buf[n++] = xsrc[list[i]];
      buf[n++] = zsrc[list[i]];
    }
  } else {
    PPPM::pack_forward(flag, buf, nlist, list);
  }
}

/* ----------------------------------------------------------------------
   unpack another proc's own values from buf and set own ghost values
------------------------------------------------------------------------- */

void PPPMIntel::unpack_forward(int flag, FFT_SCALAR *buf, int nlist, int *list)
{
  int n = 0;

  if ((flag == FORWARD_IK) && _use_packing) {
    FFT_SCALAR *xdest = &vdxy_brick[nzlo_out][nylo_out][2*nxlo_out];
    FFT_SCALAR *zdest = &vdz0_brick[nzlo_out][nylo_out][2*nxlo_out];
    for (int i = 0; i < nlist; i++) {
      xdest[list[i]] = buf[n++];
      zdest[list[i]] = buf[n++];
    }
  } else {
    PPPM::unpack_forward(flag, buf, nlist, list);
  }
}

/* ----------------------------------------------------------------------
   memory usage of local arrays
------------------------------------------------------------------------- */
@@ -1201,14 +961,6 @@ double PPPMIntel::memory_usage()
      bytes += rho_points * INTEL_P3M_ALIGNED_MAXORDER * sizeof(FFT_SCALAR);
    }
  }
  if (_use_packing) {
    bytes += 2 * (nzhi_out + 2 - nzlo_out + 1) * (nyhi_out - nylo_out + 1)
               * (2 * nxhi_out + 1 - 2 * nxlo_out + 1) * sizeof(FFT_SCALAR);
    bytes -= 3 * (nxhi_out - nxlo_out + 1) * (nyhi_out - nylo_out + 1)
               * (nzhi_out - nzlo_out + 1) * sizeof(FFT_SCALAR);
    bytes += 2 * nfft_both * sizeof(FFT_SCALAR);
    bytes += cg_pack->memory_usage();
  }
  return bytes;
}

+3 −20
Original line number Diff line number Diff line
@@ -38,8 +38,6 @@ class PPPMIntel : public PPPM {
  virtual ~PPPMIntel();
  virtual void init();
  virtual void compute(int, int);
  virtual void pack_forward(int, FFT_SCALAR *, int, int *);
  virtual void unpack_forward(int, FFT_SCALAR *, int, int *);
  virtual double memory_usage();
  void compute_first(int, int);
  void compute_second(int, int);
@@ -64,12 +62,6 @@ class PPPMIntel : public PPPM {
  FFT_SCALAR **drho_lookup;
  FFT_SCALAR half_rho_scale, half_rho_scale_plus;

  int _use_packing;
  FFT_SCALAR ***vdxy_brick;
  FFT_SCALAR ***vdz0_brick;
  FFT_SCALAR *work3;
  class GridComm *cg_pack;

  #ifdef _LMP_INTEL_OFFLOAD
  int _use_base;
  #endif
@@ -92,23 +84,14 @@ class PPPMIntel : public PPPM {
      make_rho<flt_t,acc_t,0>(buffers);
    }
  }
  void poisson_ik_intel();
  template<class flt_t, class acc_t, int use_table, int use_packing>
  template<class flt_t, class acc_t, int use_table>
  void fieldforce_ik(IntelBuffers<flt_t,acc_t> *buffers);
  template<class flt_t, class acc_t>
  void fieldforce_ik(IntelBuffers<flt_t,acc_t> *buffers) {
    if (_use_table == 1) {
      if (_use_packing == 1) {
        fieldforce_ik<flt_t, acc_t, 1, 1>(buffers);
      } else {
        fieldforce_ik<flt_t, acc_t, 1, 0>(buffers);
      }
    } else {
      if (_use_packing == 1) {
        fieldforce_ik<flt_t, acc_t, 0, 1>(buffers);
      fieldforce_ik<flt_t, acc_t, 1>(buffers);
    } else {
        fieldforce_ik<flt_t, acc_t, 0, 0>(buffers);
      }
      fieldforce_ik<flt_t, acc_t, 0>(buffers);
    }
  }
  template<class flt_t, class acc_t, int use_table>