Unverified commit 1db4705a, authored by Steve Plimpton and committed by GitHub
Browse files

Merge pull request #966 from wmbrownIntel/user-intel-2018Jun

Intel Package: Bug fix when using LRT with minimize and for virial calculation with GNU... 
parents 8e77be08 2fe0eabc
Loading
Loading
Loading
Loading
+2 −1
Original line number Diff line number Diff line
@@ -82,7 +82,8 @@ class FixIntel : public Fix {
  }
  inline void set_reduce_flag() { if (_nthreads > 1) _need_reduce = 1; }
  // Returns the stored _lrt setting only when the active kspace style matches
  // "pppm/intel" AND update->whichflag == 1; returns 0 in every other case.
  //
  // A previous, unguarded `return _lrt` for any "pppm/intel" match ran before
  // the whichflag test and made that test ineffective; it is dropped here so
  // the guard actually takes effect (the fix targets LRT used with minimize).
  // NOTE(review): presumably whichflag == 1 marks a dynamics run rather than
  // a minimization -- confirm against the Update class.
  inline int lrt() {
    if (force->kspace_match("pppm/intel", 0) && update->whichflag == 1)
      return _lrt;
    return 0;
  }
  inline int pppm_table() {
+50 −3
Original line number Diff line number Diff line
@@ -134,6 +134,20 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR,
#define INTEL_HTHREADS 2
#endif

// IP_PRE_edge_align(n, esize)
//
// Rounds the integer lvalue `n` up, in place, to the next multiple of
// (INTEL_DATA_ALIGN / esize), where `esize` is the element size in bytes.
// The rounding is done with a bit mask, so it is only correct when
// INTEL_DATA_ALIGN / esize is a power of two -- NOTE(review): confirm that
// every configured INTEL_DATA_ALIGN satisfies this.
//
// The macro expands to a brace-enclosed block that declares a local named
// `pad_mask`; do not pass a variable with that name as `n`.
// When INTEL_DATA_ALIGN <= 1 the macro expands to nothing.
//
// Both parameters are parenthesized so that expression arguments keep their
// intended precedence.
#if INTEL_DATA_ALIGN > 1

#define IP_PRE_edge_align(n, esize)                                     \
  {                                                                     \
    const int pad_mask =                                                \
      ~static_cast<int>(INTEL_DATA_ALIGN / (esize) - 1);                \
    (n) = ((n) + INTEL_DATA_ALIGN / (esize) - 1) & pad_mask;            \
  }

#else

// No trailing line-continuation here: the empty expansion must not depend on
// a blank line following the directive.
#define IP_PRE_edge_align(n, esize)

#endif

#define IP_PRE_get_stride(stride, n, datasize, torque)          \
  {                                                             \
    int blength = n;                                            \
@@ -303,7 +317,7 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR,
  {                                                             \
    tid = 0;                                                    \
    ifrom = 0;                                                  \
    ip = 1;                                                     \
    ip = vecsize;                                               \
    ito = inum;                                                 \
  }

@@ -316,7 +330,7 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR,
  acc_t *f_scalar = &f_start[0].x;                                      \
  flt_t *x_scalar = &pos[minlocal].x;                                   \
  int f_stride4 = f_stride * 4;                                         \
  _alignvar(acc_t ovv[INTEL_COMPILE_WIDTH],64);                         \
  _alignvar(acc_t ovv[16],64);                                          \
  int vwidth;                                                           \
  if (sizeof(acc_t) == sizeof(double))                                  \
    vwidth = INTEL_COMPILE_WIDTH/2;                                     \
@@ -516,6 +530,22 @@ inline double MIC_Wtime() {
  return time;
}

// IP_PRE_neighbor_pad(jnum, offload)
//
// Rounds the integer lvalue `jnum` up, in place, to a multiple of the
// neighbor padding width. The width is expressed in units of `float` and is
// rescaled by sizeof(float) / sizeof(flt_t), so `flt_t` must name a type that
// is visible at the expansion site.
//   - offload is true and INTEL_MIC_NBOR_PAD > 1: pad to INTEL_MIC_NBOR_PAD
//   - otherwise, INTEL_NBOR_PAD > 1:              pad to INTEL_NBOR_PAD
//   - otherwise:                                  jnum is left unchanged
// The rounding uses bit masks, so the rescaled pad widths must be powers of
// two -- NOTE(review): confirm for all supported configurations.
//
// The expansion is a brace-enclosed block declaring locals `opad_mask` and
// `pad_mask`; do not pass variables with those names as arguments.
// Both parameters are parenthesized so that expression arguments (for
// example `a || b` as `offload`) keep their intended precedence.
#define IP_PRE_neighbor_pad(jnum, offload)                              \
{                                                                       \
  const int opad_mask = ~static_cast<int>(INTEL_MIC_NBOR_PAD *          \
                                          sizeof(float) /               \
                                          sizeof(flt_t) - 1);           \
  const int pad_mask = ~static_cast<int>(INTEL_NBOR_PAD *               \
                                          sizeof(float) /               \
                                          sizeof(flt_t) - 1);           \
  if ((offload) && INTEL_MIC_NBOR_PAD > 1)                              \
    (jnum) = ((jnum) + INTEL_MIC_NBOR_PAD * sizeof(float) /             \
              sizeof(flt_t) - 1) & opad_mask;                           \
  else if (INTEL_NBOR_PAD > 1)                                          \
    (jnum) = ((jnum) + INTEL_NBOR_PAD * sizeof(float) /                 \
              sizeof(flt_t) - 1) & pad_mask;                            \
}

#define IP_PRE_pack_separate_buffers(fix, buffers, ago, offload,        \
                                     nlocal, nall)                      \
{                                                                       \
@@ -644,6 +674,23 @@ inline double MIC_Wtime() {

#else

// IP_PRE_neighbor_pad(jnum, offload) -- variant without an offload branch.
//
// When INTEL_NBOR_PAD > 1, rounds the integer lvalue `jnum` up, in place, to
// a multiple of INTEL_NBOR_PAD * sizeof(float) / sizeof(flt_t); `flt_t` must
// name a type visible at the expansion site. The `offload` argument is
// accepted for signature compatibility and is not used. The rounding uses a
// bit mask, so the rescaled pad width must be a power of two --
// NOTE(review): confirm for all supported configurations.
// When INTEL_NBOR_PAD <= 1 the macro expands to nothing.
//
// The expansion is a brace-enclosed block declaring a local `pad_mask`; do
// not pass a variable with that name as `jnum`. `jnum` is parenthesized so
// that an expression argument keeps its intended precedence.
#if INTEL_NBOR_PAD > 1

#define IP_PRE_neighbor_pad(jnum, offload)                              \
{                                                                       \
  const int pad_mask = ~static_cast<int>(INTEL_NBOR_PAD *               \
                                         sizeof(float) /                \
                                         sizeof(flt_t) - 1);            \
  (jnum) = ((jnum) + INTEL_NBOR_PAD * sizeof(float) /                   \
            sizeof(flt_t) - 1) & pad_mask;                              \
}

#else

#define IP_PRE_neighbor_pad(jnum, offload)

#endif

#define MIC_Wtime MPI_Wtime
#define IP_PRE_pack_separate_buffers(fix, buffers, ago, offload,        \
                                     nlocal, nall)
+98 −97
Original line number Diff line number Diff line
@@ -112,7 +112,6 @@ void NPairFullBinGhostIntel::fbi(const int offload, NeighList * list,
  if (pend-pstart == 0) return;

  const int nall = atom->nlocal + atom->nghost;
  int pad = 1;
  int nall_t = nall;
  const int aend = nall;

@@ -207,6 +206,17 @@ void NPairFullBinGhostIntel::fbi(const int offload, NeighList * list,
  const int mbinz = this->mbinz;
  const int * const stencilxyz = &this->stencilxyz[0][0];

  int sb = 1;
  if (special_flag[1] == 0) {
    sb = 2;
    if (special_flag[2] == 0) {
      sb = 3;
      if (special_flag[3] == 0)
        sb = 4;
    }
  }
  const int special_bound = sb;

  #ifdef _LMP_INTEL_OFFLOAD
  const int * _noalias const binhead = this->binhead;
  const int * _noalias const bins = this->bins;
@@ -230,7 +240,7 @@ void NPairFullBinGhostIntel::fbi(const int offload, NeighList * list,
    in(ncachex,ncachey,ncachez,ncachej:length(0) alloc_if(0) free_if(0)) \
    in(ncachejtype,ncachetag:length(0) alloc_if(0) free_if(0)) \
    in(ncache_stride,maxnbors,nthreads,maxspecial,nstencil,e_nall,offload) \
    in(separate_buffers,aend,nlocal,molecular,ntypes,mbinx,mbiny) \
    in(separate_buffers,aend,nlocal,molecular,ntypes,mbinx,mbiny,special_bound)\
    in(mbinz,xperiodic,yperiodic,zperiodic,xprd_half,yprd_half,zprd_half) \
    in(stencilxyz:length(3*nstencil)) \
    out(overflow:length(5) alloc_if(0) free_if(0)) \
@@ -287,8 +297,6 @@ void NPairFullBinGhostIntel::fbi(const int offload, NeighList * list,
      int e_ito = ito;
      const int list_size = (e_ito + tid * 2 + 2) * maxnbors;

      int which;

      int pack_offset = maxnbors;
      int ct = (ifrom + tid * 2) * maxnbors;
      int *neighptr = firstneigh + ct;
@@ -418,41 +426,109 @@ void NPairFullBinGhostIntel::fbi(const int offload, NeighList * list,
          }
        } // for u

        if (molecular && i < nlocal) {
          int alln = n;
          n = 0;
          #if defined(LMP_SIMD_COMPILER)
          #pragma vector aligned
          #ifdef LMP_INTEL_NBOR_COMPAT
          #pragma ivdep
          #else
          #pragma simd
          #endif
          #endif
          for (int u = 0; u < alln; u++) {
            int which;
            int addme = 1;
            int j = neighptr[u];
            if (need_ic && j < 0) {
              which = 0;
              j = -j - 1;
            } else
              ofind_special(which, special, nspecial, i, tag[j]);
            if (which) {
              j = j ^ (which << SBBITS);
              if (which < special_bound) addme = 0;
            }
            #ifdef LMP_INTEL_NBOR_COMPAT
            if (addme) neighptr2[n++] = j;
            #else
            neighptr2[n++] = j;
            #endif
          }
          alln = n2;
          n2 = maxnbors * 2;
          #if defined(LMP_SIMD_COMPILER)
          #pragma vector aligned
          #ifdef LMP_INTEL_NBOR_COMPAT
          #pragma ivdep
          #else
          #pragma simd
          #endif
          #endif
          for (int u = n2; u < alln; u++) {
            int which;
            int addme = 1;
            int j = neighptr[u];
            if (need_ic && j < 0) {
              which = 0;
              j = -j - 1;
            } else
              ofind_special(which, special, nspecial, i, tag[j]);
            if (which) {
              j = j ^ (which << SBBITS);
              if (which < special_bound) addme = 0;
            }
            #ifdef LMP_INTEL_NBOR_COMPAT
            if (addme) neighptr2[n2++] = j;
            #else
            neighptr2[n2++] = j;
            #endif
          }
        }
       
        #ifndef _LMP_INTEL_OFFLOAD
        if (exclude) {
          int alln = n;
          n = maxnbors;
          for (int u = pack_offset; u < alln; u++) {
            const int j = neighptr[u];
            int pj = j;
            if (need_ic)
              if (pj < 0) pj = -j - 1;
            const int jtype = x[pj].w;
            if (exclusion(i,pj,itype,jtype,mask,molecule)) continue;
            neighptr[n++] = j;
          #if defined(LMP_SIMD_COMPILER)
          #pragma vector aligned
          #pragma ivdep
          #endif
          for (int u = n; u < alln; u++) {
            int addme = 1;
            const int js = neighptr[u];
            const int j = js & NEIGHMASK;
            const int jtype = x[j].w;
            if (exclusion(i,j,itype,jtype,mask,molecule)) addme = 0;
            if (addme) neighptr2[n++] = js;
          }
          alln = n2;
          n2 = maxnbors * 2;
          #if defined(LMP_SIMD_COMPILER)
          #pragma vector aligned
          #pragma ivdep
          #endif
          for (int u = n2; u < alln; u++) {
            const int j = neighptr[u];
            int pj = j;
            if (need_ic)
              if (pj < 0) pj = -j - 1;
            const int jtype = x[pj].w;
            if (exclusion(i,pj,itype,jtype,mask,molecule)) continue;
            neighptr[n2++] = j;
            int addme = 1;
            const int js = neighptr[u];
            const int j = js & NEIGHMASK;
            const int jtype = x[j].w;
            if (exclusion(i,j,itype,jtype,mask,molecule)) addme = 0;
            if (addme) neighptr2[n2++] = js;
          }
        }
        #endif
        
        int ns = n - maxnbors;
        int alln = n;
        atombin[i] = ns;
        n = 0;
        for (int u = maxnbors; u < alln; u++)
          neighptr[n++] = neighptr[u];
          neighptr[n++] = neighptr2[u];
        ns += n2 - maxnbors * 2;
        for (int u = maxnbors * 2; u < n2; u++)
          neighptr[n++] = neighptr[u];
          neighptr[n++] = neighptr2[u];
        if (ns > maxnbors) *overflow = 1;

        ilist[i] = i;
@@ -460,9 +536,7 @@ void NPairFullBinGhostIntel::fbi(const int offload, NeighList * list,
        numneigh[i] = ns;

        ct += ns;
        const int alignb = (INTEL_DATA_ALIGN / sizeof(int));
        const int edge = ct & (alignb - 1);
        if (edge) ct += alignb - edge;
        IP_PRE_edge_align(ct, sizeof(int));
        neighptr = firstneigh + ct;
        if (ct + obound > list_size) {
          if (i < ito - 1) {
@@ -477,84 +551,11 @@ void NPairFullBinGhostIntel::fbi(const int offload, NeighList * list,
          numneigh[i] = 0;

      #ifdef _LMP_INTEL_OFFLOAD
      int ghost_offset = 0, nall_offset = e_nall;
      if (separate_buffers) {
        for (int i = ifrom; i < ito; ++i) {
          int * _noalias jlist = firstneigh + cnumneigh[i];
          const int jnum = numneigh[i];
          #if __INTEL_COMPILER+0 > 1499
          #pragma vector aligned
          #pragma simd
          #endif
          for (int jj = 0; jj < jnum; jj++) {
            int j = jlist[jj];
            if (need_ic && j < 0) j = -j - 1;
          }
        }

        overflow[LMP_LOCAL_MIN] = 0;
        overflow[LMP_LOCAL_MAX] = nlocal - 1;
        overflow[LMP_GHOST_MIN] = nlocal;
        overflow[LMP_GHOST_MAX] = e_nall - 1;

        int nghost = overflow[LMP_GHOST_MAX] + 1 - overflow[LMP_GHOST_MIN];
        if (nghost < 0) nghost = 0;
        if (offload) {
          ghost_offset = overflow[LMP_GHOST_MIN] - overflow[LMP_LOCAL_MAX] - 1;
          nall_offset = overflow[LMP_LOCAL_MAX] + 1 + nghost;
        } else {
          ghost_offset = overflow[LMP_GHOST_MIN] - nlocal;
          nall_offset = nlocal + nghost;
        }
      } // if separate_buffers
      #endif

      if (molecular) {
        int ito_m = ito;
        if (ito >= nlocal) ito_m = nlocal;
        for (int i = ifrom; i < ito_m; ++i) {
          int * _noalias jlist = firstneigh + cnumneigh[i];
          const int jnum = numneigh[i];

          #if defined(LMP_SIMD_COMPILER)
          #pragma vector aligned
          #pragma simd
          #endif
          for (int jj = 0; jj < jnum; jj++) {
            const int j = jlist[jj];
            if (need_ic && j < 0) {
              which = 0;
              jlist[jj] = -j - 1;
            } else
              ofind_special(which, special, nspecial, i, tag[j]);
            #ifdef _LMP_INTEL_OFFLOAD
            if (j >= nlocal) {
              if (j == e_nall)
                jlist[jj] = nall_offset;
              else if (which)
                jlist[jj] = (j-ghost_offset) ^ (which << SBBITS);
              else jlist[jj]-=ghost_offset;
            } else
            #endif
            if (which) jlist[jj] = j ^ (which << SBBITS);
          }
        } // for i
      } // if molecular
      #ifdef _LMP_INTEL_OFFLOAD
      else if (separate_buffers) {
        for (int i = ifrom; i < ito; ++i) {
          int * _noalias jlist = firstneigh + cnumneigh[i];
          const int jnum = numneigh[i];
          int jj = 0;
          #pragma vector aligned
          #pragma simd
          for (jj = 0; jj < jnum; jj++) {
            if (jlist[jj] >= nlocal) {
              if (jlist[jj] == e_nall) jlist[jj] = nall_offset;
              else jlist[jj] -= ghost_offset;
            }
          }
        }
      }
      #endif
    } // end omp
+165 −122

File changed.

Preview size limit exceeded, changes collapsed.

+7 −3
Original line number Diff line number Diff line
@@ -142,6 +142,7 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag,
  ATOM_T * _noalias const x = buffers->get_x(offload);
  flt_t * _noalias const q = buffers->get_q(offload);

  const int * _noalias const ilist = list->ilist;
  const int * _noalias const numneigh = list->numneigh;
  const int * _noalias const cnumneigh = buffers->cnumneigh(list);
  const int * _noalias const firstneigh = buffers->firstneigh(list);
@@ -185,6 +186,7 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag,
    in(numneigh:length(0) alloc_if(0) free_if(0)) \
    in(x:length(x_size) alloc_if(0) free_if(0)) \
    in(q:length(q_size) alloc_if(0) free_if(0)) \
    in(ilist:length(0) alloc_if(0) free_if(0)) \
    in(overflow:length(0) alloc_if(0) free_if(0)) \
    in(astart,nthreads,qqrd2e,inum,nall,ntypes,vflag,eatom) \
    in(f_stride,nlocal,minlocal,separate_flag,offload) \
@@ -221,15 +223,17 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag,
      FORCE_T * _noalias const f = f_start + foff;
      if (NEWTON_PAIR) memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));

      for (int i = iifrom; i < iito; i += iip) {
      for (int ii = iifrom; ii < iito; ii += iip) {
        const int i = ilist[ii];
        const int itype = x[i].w;

        const int ptr_off = itype * ntypes;
        const C_FORCE_T * _noalias const c_forcei = c_force + ptr_off;
        const C_ENERGY_T * _noalias const c_energyi = c_energy + ptr_off;
        const C_CUT_T * _noalias const c_cuti = c_cut + ptr_off;
        const int   * _noalias const jlist = firstneigh + cnumneigh[i];
        const int jnum = numneigh[i];
        const int   * _noalias const jlist = firstneigh + cnumneigh[ii];
        int jnum = numneigh[ii];
        IP_PRE_neighbor_pad(jnum, offload);

        acc_t fxtmp,fytmp,fztmp,fwtmp;
        acc_t sevdwl, secoul, sv0, sv1, sv2, sv3, sv4, sv5;
Loading