Fixing bug in pppm/intel for AVX-512 with single precision and ik diff. (7e58f084) · Commits · 郑智淋 / lammps

src/USER-INTEL/pppm_intel.cpp

+16 −264

Original line number	Diff line number	Diff line
		@@ -66,11 +66,7 @@ PPPMIntel::PPPMIntel(LAMMPS lmp, int narg, char *arg) : PPPM(lmp, narg, arg)
		rho_lookup = drho_lookup = NULL;
		rho_points = 0;

		vdxy_brick = vdz0_brick = NULL;
		work3 = NULL;
		cg_pack = NULL;

		_use_table = _use_packing = _use_lrt = 0;
		_use_table = _use_lrt = 0;
		}

		PPPMIntel::~PPPMIntel()
		@@ -82,12 +78,6 @@ PPPMIntel::~PPPMIntel()

		memory->destroy(rho_lookup);
		memory->destroy(drho_lookup);

		memory->destroy3d_offset(vdxy_brick, nzlo_out, nylo_out, 2*nxlo_out);
		memory->destroy3d_offset(vdz0_brick, nzlo_out, nylo_out, 2*nxlo_out);
		memory->destroy(work3);

		delete cg_pack;
		}

		/* ----------------------------------------------------------------------
		@@ -141,36 +131,6 @@ void PPPMIntel::init()
		if (order > INTEL_P3M_MAXORDER)
		error->all(FLERR,"PPPM order greater than supported by USER-INTEL\n");

		_use_packing = (order == 7) && (INTEL_VECTOR_WIDTH == 16)
		&& (sizeof(FFT_SCALAR) == sizeof(float))
		&& (differentiation_flag == 0);
		if (_use_packing) {
		memory->destroy3d_offset(vdx_brick,nzlo_out,nylo_out,nxlo_out);
		memory->destroy3d_offset(vdy_brick,nzlo_out,nylo_out,nxlo_out);
		memory->destroy3d_offset(vdz_brick,nzlo_out,nylo_out,nxlo_out);
		memory->destroy3d_offset(vdxy_brick, nzlo_out, nylo_out, 2*nxlo_out);
		create3d_offset(vdxy_brick, nzlo_out, nzhi_out+2,
		nylo_out, nyhi_out, 2nxlo_out, 2nxhi_out+1,
		"pppmintel:vdxy_brick");
		memory->destroy3d_offset(vdz0_brick, nzlo_out, nylo_out, 2*nxlo_out);
		create3d_offset(vdz0_brick, nzlo_out, nzhi_out+2,
		nylo_out, nyhi_out, 2nxlo_out, 2nxhi_out+1,
		"pppmintel:vdz0_brick");
		memory->destroy(work3);
		memory->create(work3, 2*nfft_both, "pppmintel:work3");

		// new communicator for the double-size bricks
		delete cg_pack;
		int (*procneigh)[2] = comm->procneigh;
		cg_pack = new GridComm(lmp,world,2,0, 2nxlo_in,2nxhi_in+1,nylo_in,
		nyhi_in,nzlo_in,nzhi_in, 2nxlo_out,2nxhi_out+1,
		nylo_out,nyhi_out,nzlo_out,nzhi_out,
		procneigh[0][0],procneigh[0][1],procneigh[1][0],
		procneigh[1][1],procneigh[2][0],procneigh[2][1]);

		cg_pack->ghost_notify();
		cg_pack->setup();
		}
		}

		/* ----------------------------------------------------------------------
		@@ -272,18 +232,13 @@ void PPPMIntel::compute_first(int eflag, int vflag)
		// also performs per-atom calculations via poisson_peratom()

		if (differentiation_flag == 1) poisson_ad();
		else poisson_ik_intel();
		else poisson_ik();

		// all procs communicate E-field values
		// to fill ghost cells surrounding their 3d bricks

		if (differentiation_flag == 1) cg->forward_comm(this,FORWARD_AD);
		else {
		if (_use_packing)
		cg_pack->forward_comm(this,FORWARD_IK);
		else
		cg->forward_comm(this,FORWARD_IK);
		}
		else cg->forward_comm(this,FORWARD_IK);

		// extra per-atom energy/virial communication

		@@ -604,7 +559,7 @@ void PPPMIntel::make_rho(IntelBuffers<flt_t,acc_t> *buffers)
		interpolate from grid to get electric field & force on my particles for ik
		------------------------------------------------------------------------- */

		template<class flt_t, class acc_t, int use_table, int use_packing>
		template<class flt_t, class acc_t, int use_table>
		void PPPMIntel::fieldforce_ik(IntelBuffers<flt_t,acc_t> *buffers)
		{
		// loop over my charges, interpolate electric field from nearby grid points
		@@ -649,9 +604,9 @@ void PPPMIntel::fieldforce_ik(IntelBuffers<flt_t,acc_t> *buffers)
		int ny = part2grid[i][1];
		int nz = part2grid[i][2];

		int nxsum = (use_packing ? 2 : 1) * (nx + nlower);
		int nxsum = nx + nlower;
		int nysum = ny + nlower;
		int nzsum = nz + nlower;;
		int nzsum = nz + nlower;

		FFT_SCALAR dx = nx+fshiftone - (x[i].x-lo0)*xi;
		FFT_SCALAR dy = ny+fshiftone - (x[i].y-lo1)*yi;
		@@ -668,12 +623,7 @@ void PPPMIntel::fieldforce_ik(IntelBuffers<flt_t,acc_t> *buffers)
		#pragma simd
		#endif
		for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
		if (use_packing) {
		rho0[2 * k] = rho_lookup[idx][k];
		rho0[2 * k + 1] = rho_lookup[idx][k];
		} else {
		rho0[k] = rho_lookup[idx][k];
		}
		rho1[k] = rho_lookup[idy][k];
		rho2[k] = rho_lookup[idz][k];
		}
		@@ -690,12 +640,7 @@ void PPPMIntel::fieldforce_ik(IntelBuffers<flt_t,acc_t> *buffers)
		r2 = rho_coeff[l][k] + r2*dy;
		r3 = rho_coeff[l][k] + r3*dz;
		}
		if (use_packing) {
		rho0[2 * (k-nlower)] = r1;
		rho0[2 * (k-nlower) + 1] = r1;
		} else {
		rho0[k-nlower] = r1;
		}
		rho1[k-nlower] = r2;
		rho2[k-nlower] = r3;
		}
		@@ -722,38 +667,24 @@ void PPPMIntel::fieldforce_ik(IntelBuffers<flt_t,acc_t> *buffers)
		#if defined(LMP_SIMD_COMPILER)
		#pragma simd
		#endif
		for (int l = 0; l < (use_packing ? 2 : 1) *
		INTEL_P3M_ALIGNED_MAXORDER; l++) {
		for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
		int mx = l+nxsum;
		FFT_SCALAR x0 = y0*rho0[l];
		if (use_packing) {
		ekxy_arr[l] -= x0*vdxy_brick[mz][my][mx];
		ekz0_arr[l] -= x0*vdz0_brick[mz][my][mx];
		} else {
		ekx_arr[l] -= x0*vdx_brick[mz][my][mx];
		eky_arr[l] -= x0*vdy_brick[mz][my][mx];
		ekz_arr[l] -= x0*vdz_brick[mz][my][mx];
		}
		}
		}
		}

		FFT_SCALAR ekx, eky, ekz;
		ekx = eky = ekz = ZEROF;

		if (use_packing) {
		for (int l = 0; l < 2*order; l += 2) {
		ekx += ekxy_arr[l];
		eky += ekxy_arr[l+1];
		ekz += ekz0_arr[l];
		}
		} else {
		for (int l = 0; l < order; l++) {
		ekx += ekx_arr[l];
		eky += eky_arr[l];
		ekz += ekz_arr[l];
		}
		}

		// convert E-field to force

		@@ -965,137 +896,6 @@ void PPPMIntel::fieldforce_ad(IntelBuffers<flt_t,acc_t> *buffers)
		}
		}

		/* ----------------------------------------------------------------------
		FFT-based Poisson solver for ik
		Does special things for packing mode to avoid repeated copies
		------------------------------------------------------------------------- */

		void PPPMIntel::poisson_ik_intel()
		{
		if (_use_packing == 0) {
		poisson_ik();
		return;
		}

		int i,j,k,n;
		double eng;

		// transform charge density (r -> k)

		n = 0;
		for (i = 0; i < nfft; i++) {
		work1[n++] = density_fft[i];
		work1[n++] = ZEROF;
		}

		fft1->compute(work1,work1,1);

		// global energy and virial contribution

		double scaleinv = 1.0/(nx_pppmny_pppmnz_pppm);
		double s2 = scaleinv*scaleinv;

		if (eflag_global \|\| vflag_global) {
		if (vflag_global) {
		n = 0;
		for (i = 0; i < nfft; i++) {
		eng = s2 * greensfn[i] * (work1[n]*work1[n] +
		work1[n+1]*work1[n+1]);
		for (j = 0; j < 6; j++) virial[j] += eng*vg[i][j];
		if (eflag_global) energy += eng;
		n += 2;
		}
		} else {
		n = 0;
		for (i = 0; i < nfft; i++) {
		energy +=
		s2 * greensfn[i] * (work1[n]work1[n] + work1[n+1]work1[n+1]);
		n += 2;
		}
		}
		}

		// scale by 1/total-grid-pts to get rho(k)
		// multiply by Green's function to get V(k)

		n = 0;
		for (i = 0; i < nfft; i++) {
		work1[n++] = scaleinv greensfn[i];
		work1[n++] = scaleinv greensfn[i];
		}

		// extra FFTs for per-atom energy/virial

		if (evflag_atom) poisson_peratom();

		// triclinic system

		if (triclinic) {
		poisson_ik_triclinic();
		return;
		}

		// compute gradients of V(r) in each of 3 dims by transformimg -ik*V(k)
		// FFT leaves data in 3d brick decomposition
		// copy it into inner portion of vdx,vdy,vdz arrays

		// x direction gradient
		n = 0;
		for (k = nzlo_fft; k <= nzhi_fft; k++)
		for (j = nylo_fft; j <= nyhi_fft; j++)
		for (i = nxlo_fft; i <= nxhi_fft; i++) {
		work2[n] = fkx[i]*work1[n+1];
		work2[n+1] = -fkx[i]*work1[n];
		n += 2;
		}

		fft2->compute(work2,work2,-1);

		// y direction gradient

		n = 0;
		for (k = nzlo_fft; k <= nzhi_fft; k++)
		for (j = nylo_fft; j <= nyhi_fft; j++)
		for (i = nxlo_fft; i <= nxhi_fft; i++) {
		work3[n] = fky[j]*work1[n+1];
		work3[n+1] = -fky[j]*work1[n];
		n += 2;
		}

		fft2->compute(work3,work3,-1);

		n = 0;
		for (k = nzlo_in; k <= nzhi_in; k++)
		for (j = nylo_in; j <= nyhi_in; j++)
		for (i = nxlo_in; i <= nxhi_in; i++) {
		vdxy_brick[k][j][2*i] = work2[n];
		vdxy_brick[k][j][2*i+1] = work3[n];
		n += 2;
		}

		// z direction gradient

		n = 0;
		for (k = nzlo_fft; k <= nzhi_fft; k++)
		for (j = nylo_fft; j <= nyhi_fft; j++)
		for (i = nxlo_fft; i <= nxhi_fft; i++) {
		work2[n] = fkz[k]*work1[n+1];
		work2[n+1] = -fkz[k]*work1[n];
		n += 2;
		}

		fft2->compute(work2,work2,-1);

		n = 0;
		for (k = nzlo_in; k <= nzhi_in; k++)
		for (j = nylo_in; j <= nyhi_in; j++)
		for (i = nxlo_in; i <= nxhi_in; i++) {
		vdz0_brick[k][j][2*i] = work2[n];
		vdz0_brick[k][j][2*i+1] = 0.;
		n += 2;
		}
		}

		/* ----------------------------------------------------------------------
		precompute rho coefficients as a lookup table to save time in make_rho
		and fieldforce. Instead of doing this polynomial for every atom 6 times
		@@ -1141,46 +941,6 @@ void PPPMIntel::precompute_rho()
		}
		}

		/* ----------------------------------------------------------------------
		pack own values to buf to send to another proc
		------------------------------------------------------------------------- */

		void PPPMIntel::pack_forward(int flag, FFT_SCALAR buf, int nlist, int list)
		{
		int n = 0;

		if ((flag == FORWARD_IK) && _use_packing) {
		FFT_SCALAR xsrc = &vdxy_brick[nzlo_out][nylo_out][2nxlo_out];
		FFT_SCALAR zsrc = &vdz0_brick[nzlo_out][nylo_out][2nxlo_out];
		for (int i = 0; i < nlist; i++) {
		buf[n++] = xsrc[list[i]];
		buf[n++] = zsrc[list[i]];
		}
		} else {
		PPPM::pack_forward(flag, buf, nlist, list);
		}
		}

		/* ----------------------------------------------------------------------
		unpack another proc's own values from buf and set own ghost values
		------------------------------------------------------------------------- */

		void PPPMIntel::unpack_forward(int flag, FFT_SCALAR buf, int nlist, int list)
		{
		int n = 0;

		if ((flag == FORWARD_IK) && _use_packing) {
		FFT_SCALAR xdest = &vdxy_brick[nzlo_out][nylo_out][2nxlo_out];
		FFT_SCALAR zdest = &vdz0_brick[nzlo_out][nylo_out][2nxlo_out];
		for (int i = 0; i < nlist; i++) {
		xdest[list[i]] = buf[n++];
		zdest[list[i]] = buf[n++];
		}
		} else {
		PPPM::unpack_forward(flag, buf, nlist, list);
		}
		}

		/* ----------------------------------------------------------------------
		memory usage of local arrays
		------------------------------------------------------------------------- */
		@@ -1201,14 +961,6 @@ double PPPMIntel::memory_usage()
		bytes += rho_points * INTEL_P3M_ALIGNED_MAXORDER * sizeof(FFT_SCALAR);
		}
		}
		if (_use_packing) {
		bytes += 2 * (nzhi_out + 2 - nzlo_out + 1) * (nyhi_out - nylo_out + 1)
		* (2 * nxhi_out + 1 - 2 * nxlo_out + 1) * sizeof(FFT_SCALAR);
		bytes -= 3 * (nxhi_out - nxlo_out + 1) * (nyhi_out - nylo_out + 1)
		* (nzhi_out - nzlo_out + 1) * sizeof(FFT_SCALAR);
		bytes += 2 * nfft_both * sizeof(FFT_SCALAR);
		bytes += cg_pack->memory_usage();
		}
		return bytes;
		}

src/USER-INTEL/pppm_intel.h

+3 −20

Original line number	Diff line number	Diff line
		@@ -38,8 +38,6 @@ class PPPMIntel : public PPPM {
		virtual ~PPPMIntel();
		virtual void init();
		virtual void compute(int, int);
		virtual void pack_forward(int, FFT_SCALAR , int, int );
		virtual void unpack_forward(int, FFT_SCALAR , int, int );
		virtual double memory_usage();
		void compute_first(int, int);
		void compute_second(int, int);
		@@ -64,12 +62,6 @@ class PPPMIntel : public PPPM {
		FFT_SCALAR **drho_lookup;
		FFT_SCALAR half_rho_scale, half_rho_scale_plus;

		int _use_packing;
		FFT_SCALAR ***vdxy_brick;
		FFT_SCALAR ***vdz0_brick;
		FFT_SCALAR *work3;
		class GridComm *cg_pack;

		#ifdef _LMP_INTEL_OFFLOAD
		int _use_base;
		#endif
		@@ -92,23 +84,14 @@ class PPPMIntel : public PPPM {
		make_rho<flt_t,acc_t,0>(buffers);
		}
		}
		void poisson_ik_intel();
		template<class flt_t, class acc_t, int use_table, int use_packing>
		template<class flt_t, class acc_t, int use_table>
		void fieldforce_ik(IntelBuffers<flt_t,acc_t> *buffers);
		template<class flt_t, class acc_t>
		void fieldforce_ik(IntelBuffers<flt_t,acc_t> *buffers) {
		if (_use_table == 1) {
		if (_use_packing == 1) {
		fieldforce_ik<flt_t, acc_t, 1, 1>(buffers);
		} else {
		fieldforce_ik<flt_t, acc_t, 1, 0>(buffers);
		}
		} else {
		if (_use_packing == 1) {
		fieldforce_ik<flt_t, acc_t, 0, 1>(buffers);
		fieldforce_ik<flt_t, acc_t, 1>(buffers);
		} else {
		fieldforce_ik<flt_t, acc_t, 0, 0>(buffers);
		}
		fieldforce_ik<flt_t, acc_t, 0>(buffers);
		}
		}
		template<class flt_t, class acc_t, int use_table>

Admin message