Unverified Commit 6bf32909 authored by Axel Kohlmeyer's avatar Axel Kohlmeyer Committed by GitHub
Browse files

Merge pull request #2216 from ndtrung81/tersoff-gpu

Cleanup and bugfixes for some 3-body pair styles in the GPU package
parents 9e96b717 ebfe7f68
Loading
Loading
Loading
Loading
+5 −10
Original line number Diff line number Diff line
@@ -308,7 +308,7 @@ __kernel void k_sw(const __global numtyp4 *restrict x_,

}

#define threebody(delr1x, delr1y, delr1z, eflag, energy)                     \
#define threebody(delr1x,delr1y,delr1z,delr2x,delr2y,delr2z, eflag, energy)  \
{                                                                            \
  numtyp r1 = ucl_sqrt(rsq1);                                                \
  numtyp rinvsq1 = ucl_recip(rsq1);                                          \
@@ -361,7 +361,7 @@ __kernel void k_sw(const __global numtyp4 *restrict x_,
  }                                                                          \
}

#define threebody_half(delr1x, delr1y, delr1z)                               \
#define threebody_half(delr1x, delr1y, delr1z, delr2x, delr2y, delr2z)       \
{                                                                            \
  numtyp r1 = ucl_sqrt(rsq1);                                                \
  numtyp rinvsq1 = ucl_recip(rsq1);                                          \
@@ -511,7 +511,7 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
          sw_costheta_ijk=sw3_ijkparam.z;

          numtyp fjx, fjy, fjz, fkx, fky, fkz;
          threebody(delr1x,delr1y,delr1z,eflag,energy);
          threebody(delr1x,delr1y,delr1z,delr2x,delr2y,delr2z,eflag,energy);

          f.x -= fjx + fkx;
          f.y -= fjy + fky;
@@ -665,12 +665,7 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
          sw_costheta_ijk=sw3_ijkparam.z;

          numtyp fjx, fjy, fjz;
          //if (evatom==0) {
            threebody_half(delr1x,delr1y,delr1z);
          //} else {
          //  numtyp fkx, fky, fkz;
          //  threebody(delr1x,delr1y,delr1z,eflag,energy);
          //}
          threebody_half(delr1x,delr1y,delr1z,delr2x,delr2y,delr2z);

          f.x += fjx;
          f.y += fjy;
@@ -819,7 +814,7 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
          sw_costheta_ijk=sw3_ijkparam.z;

          numtyp fjx, fjy, fjz, fkx, fky, fkz;
          threebody(delr1x,delr1y,delr1z,eflag,energy);
          threebody(delr1x,delr1y,delr1z,delr2x,delr2y,delr2z,eflag,energy);

          f.x += fjx;
          f.y += fjy;
+4 −5
Original line number Diff line number Diff line
@@ -250,10 +250,9 @@ void TersoffT::loop(const bool _eflag, const bool _vflag, const int evatom) {
                               (BX/this->_threads_per_atom)));

  this->k_short_nbor.set_size(GX,BX);
  this->k_short_nbor.run(&this->atom->x, &cutsq, &map,
                 &elem2param, &_nelements, &_nparams,
                 &this->nbor->dev_nbor, &this->_nbor_data->begin(),
                 &this->dev_short_nbor, &ainum,
  this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor,
                         &this->_nbor_data->begin(),
                         &this->dev_short_nbor, &_cutshortsq, &ainum,
                         &nbor_pitch, &this->_threads_per_atom);

  // re-allocate zetaij if necessary
+11 −34
Original line number Diff line number Diff line
@@ -165,13 +165,10 @@ _texture( ts5_tex,int4);
#endif

__kernel void k_tersoff_short_nbor(const __global numtyp4 *restrict x_,
                                   const __global numtyp *restrict cutsq,
                                   const __global int *restrict map,
                                   const __global int *restrict elem2param,
                                   const int nelements, const int nparams,
                                   const __global int * dev_nbor,
                                   const __global int * dev_packed,
                                   __global int * dev_short_nbor,
                                   const numtyp _cutshortsq,
                                   const int inum, const int nbor_pitch,
                                   const int t_per_atom) {
  __local int n_stride;
@@ -185,8 +182,6 @@ __kernel void k_tersoff_short_nbor(const __global numtyp4 *restrict x_,
              n_stride,nbor_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    int itype=ix.w;
    itype=map[itype];

    int ncount = 0;
    int m = nbor;
@@ -200,9 +195,6 @@ __kernel void k_tersoff_short_nbor(const __global numtyp4 *restrict x_,
      j &= NEIGHMASK;

      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
      int jtype=jx.w;
      jtype=map[jtype];
      int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];

      // Compute r12
      numtyp delx = ix.x-jx.x;
@@ -210,7 +202,7 @@ __kernel void k_tersoff_short_nbor(const __global numtyp4 *restrict x_,
      numtyp delz = ix.z-jx.z;
      numtyp rsq = delx*delx+dely*dely+delz*delz;

      if (rsq<cutsq[ijparam]) {
      if (rsq<_cutshortsq) {
        dev_short_nbor[nbor_short] = nj;
        nbor_short += n_stride;
        ncount++;
@@ -460,7 +452,8 @@ __kernel void k_tersoff_repulsive(const __global numtyp4 *restrict x_,
      numtyp delz = ix.z-jx.z;
      numtyp rsq = delx*delx+dely*dely+delz*delz;

      // rsq<cutsq[ijparam]
      if (rsq >= cutsq[ijparam]) continue;

      numtyp feng[2];
      numtyp ijparam_lam1 = ts1[ijparam].x;
      numtyp4 ts2_ijparam = ts2[ijparam];
@@ -574,6 +567,7 @@ __kernel void k_tersoff_three_center(const __global numtyp4 *restrict x_,
      delr1[1] = jx.y-ix.y;
      delr1[2] = jx.z-ix.z;
      numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2];
      if (rsq1 >= cutsq[ijparam]) continue;

      numtyp r1 = ucl_sqrt(rsq1);
      numtyp r1inv = ucl_rsqrt(rsq1);
@@ -715,7 +709,7 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
  for (int i=0; i<6; i++)
    virial[i]=(acctyp)0;

  __local int red_acc[2*BLOCK_PAIR];
  __local int red_acc[BLOCK_PAIR];

  __syncthreads();

@@ -749,7 +743,6 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
      int jtype=jx.w;
      jtype=map[jtype];
      int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];

      // Compute r12
      numtyp delr1[3];
@@ -796,21 +789,14 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
        k &= NEIGHMASK;
        if (k == i) {
          ijnum = nbor_k;
          red_acc[2*m+0] = ijnum;
          red_acc[2*m+1] = offset_k;
          red_acc[m] = ijnum;
          break;
        }
      }

      numtyp r1 = ucl_sqrt(rsq1);
      numtyp r1inv = ucl_rsqrt(rsq1);
      int offset_kf;
      if (ijnum >= 0) {
        offset_kf = offset_k;
      } else {
        ijnum = red_acc[2*m+0];
        offset_kf = red_acc[2*m+1];
      }
      if (ijnum < 0) ijnum = red_acc[m];

      // idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
      int idx = ijnum;
@@ -853,7 +839,6 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
        delr2[2] = kx.z-jx.z;
        numtyp rsq2 = delr2[0]*delr2[0] + delr2[1]*delr2[1] + delr2[2]*delr2[2];

        if (rsq2 > cutsq[jikparam]) continue;
        numtyp r2 = ucl_sqrt(rsq2);
        numtyp r2inv = ucl_rsqrt(rsq2);
        numtyp4 ts1_param, ts2_param, ts4_param;
@@ -953,7 +938,7 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
  for (int i=0; i<6; i++)
    virial[i]=(acctyp)0;

  __local int red_acc[2*BLOCK_PAIR];
  __local int red_acc[BLOCK_PAIR];

  __syncthreads();

@@ -987,7 +972,6 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
      int jtype=jx.w;
      jtype=map[jtype];
      int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];

      // Compute r12
      numtyp delr1[3];
@@ -1034,21 +1018,14 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
        k &= NEIGHMASK;
        if (k == i) {
          ijnum = nbor_k;
          red_acc[2*m+0] = ijnum;
          red_acc[2*m+1] = offset_k;
          red_acc[m] = ijnum;
          break;
        }
      }

      numtyp r1 = ucl_sqrt(rsq1);
      numtyp r1inv = ucl_rsqrt(rsq1);
      int offset_kf;
      if (ijnum >= 0) {
        offset_kf = offset_k;
      } else {
        ijnum = red_acc[2*m+0];
        offset_kf = red_acc[2*m+1];
      }
      if (ijnum < 0) ijnum = red_acc[m];

      // idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
      int idx = ijnum;
+5 −6
Original line number Diff line number Diff line
@@ -192,7 +192,7 @@ int TersoffMT::init(const int ntypes, const int nlocal, const int nall, const in

  _allocated=true;
  this->_max_bytes=ts1.row_bytes()+ts2.row_bytes()+ts3.row_bytes()+
    ts4.row_bytes()+cutsq.row_bytes()+
    ts4.row_bytes()+ts5.row_bytes()+cutsq.row_bytes()+
    map.row_bytes()+elem2param.row_bytes()+_zetaij.row_bytes();
  return 0;
}
@@ -250,10 +250,9 @@ void TersoffMT::loop(const bool _eflag, const bool _vflag, const int evatom) {
                               (BX/this->_threads_per_atom)));

  this->k_short_nbor.set_size(GX,BX);
  this->k_short_nbor.run(&this->atom->x, &cutsq, &map,
                 &elem2param, &_nelements, &_nparams,
                 &this->nbor->dev_nbor, &this->_nbor_data->begin(),
                 &this->dev_short_nbor, &ainum,
  this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor,
                         &this->_nbor_data->begin(),
                         &this->dev_short_nbor, &_cutshortsq, &ainum,
                         &nbor_pitch, &this->_threads_per_atom);

  // re-allocate zetaij if necessary
+12 −34
Original line number Diff line number Diff line
@@ -165,13 +165,10 @@ _texture( ts5_tex,int4);
#endif

__kernel void k_tersoff_mod_short_nbor(const __global numtyp4 *restrict x_,
                                   const __global numtyp *restrict cutsq,
                                   const __global int *restrict map,
                                   const __global int *restrict elem2param,
                                   const int nelements, const int nparams,
                                   const __global int * dev_nbor,
                                   const __global int * dev_packed,
                                   __global int * dev_short_nbor,
                                   const numtyp _cutshortsq,
                                   const int inum, const int nbor_pitch,
                                   const int t_per_atom) {
  __local int n_stride;
@@ -185,8 +182,6 @@ __kernel void k_tersoff_mod_short_nbor(const __global numtyp4 *restrict x_,
              n_stride,nbor_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    int itype=ix.w;
    itype=map[itype];

    int ncount = 0;
    int m = nbor;
@@ -200,9 +195,6 @@ __kernel void k_tersoff_mod_short_nbor(const __global numtyp4 *restrict x_,
      j &= NEIGHMASK;

      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
      int jtype=jx.w;
      jtype=map[jtype];
      int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];

      // Compute r12
      numtyp delx = ix.x-jx.x;
@@ -210,7 +202,7 @@ __kernel void k_tersoff_mod_short_nbor(const __global numtyp4 *restrict x_,
      numtyp delz = ix.z-jx.z;
      numtyp rsq = delx*delx+dely*dely+delz*delz;

      if (rsq<cutsq[ijparam]) {
      if (rsq<_cutshortsq) {
        dev_short_nbor[nbor_short] = nj;
        nbor_short += n_stride;
        ncount++;
@@ -461,7 +453,8 @@ __kernel void k_tersoff_mod_repulsive(const __global numtyp4 *restrict x_,
      numtyp delz = ix.z-jx.z;
      numtyp rsq = delx*delx+dely*dely+delz*delz;

      // rsq<cutsq[ijparam]
      if (rsq >= cutsq[ijparam]) continue;

      numtyp feng[2];
      numtyp ijparam_lam1 = ts1[ijparam].x;
      numtyp4 ts2_ijparam = ts2[ijparam];
@@ -578,6 +571,7 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_,
      delr1[1] = jx.y-ix.y;
      delr1[2] = jx.z-ix.z;
      numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2];
      if (rsq1 >= cutsq[ijparam]) continue;

      numtyp r1 = ucl_sqrt(rsq1);
      numtyp r1inv = ucl_rsqrt(rsq1);
@@ -725,7 +719,7 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
  for (int i=0; i<6; i++)
    virial[i]=(acctyp)0;

  __local int red_acc[2*BLOCK_PAIR];
  __local int red_acc[BLOCK_PAIR];

  __syncthreads();

@@ -759,7 +753,6 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
      int jtype=jx.w;
      jtype=map[jtype];
      int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];

      // Compute r12
      numtyp delr1[3];
@@ -806,21 +799,14 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
        k &= NEIGHMASK;
        if (k == i) {
          ijnum = nbor_k;
          red_acc[2*m+0] = ijnum;
          red_acc[2*m+1] = offset_k;
          red_acc[m] = ijnum;
          break;
        }
      }

      numtyp r1 = ucl_sqrt(rsq1);
      numtyp r1inv = ucl_rsqrt(rsq1);
      int offset_kf;
      if (ijnum >= 0) {
        offset_kf = offset_k;
      } else {
        ijnum = red_acc[2*m+0];
        offset_kf = red_acc[2*m+1];
      }
      if (ijnum < 0) ijnum = red_acc[m];

      // idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
      int idx = ijnum;
@@ -863,7 +849,6 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
        delr2[2] = kx.z-jx.z;
        numtyp rsq2 = delr2[0]*delr2[0] + delr2[1]*delr2[1] + delr2[2]*delr2[2];

        if (rsq2 > cutsq[jikparam]) continue;
        numtyp r2 = ucl_sqrt(rsq2);
        numtyp r2inv = ucl_rsqrt(rsq2);
        numtyp4 ts1_param, ts2_param, ts4_param, ts5_param;
@@ -892,6 +877,7 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
        // idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
        int idx = nbor_k;
        if (dev_packed==dev_nbor) idx -= n_stride;

        acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex);
        numtyp prefactor_jk = zeta_jk.y;
        int jkiparam=elem2param[jtype*nelements*nelements+ktype*nelements+itype];
@@ -971,7 +957,7 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
  for (int i=0; i<6; i++)
    virial[i]=(acctyp)0;

  __local int red_acc[2*BLOCK_PAIR];
  __local int red_acc[BLOCK_PAIR];

  __syncthreads();

@@ -1005,7 +991,6 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
      int jtype=jx.w;
      jtype=map[jtype];
      int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];

      // Compute r12
      numtyp delr1[3];
@@ -1052,21 +1037,14 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
        k &= NEIGHMASK;
        if (k == i) {
          ijnum = nbor_k;
          red_acc[2*m+0] = ijnum;
          red_acc[2*m+1] = offset_k;
          red_acc[m] = ijnum;
          break;
        }
      }

      numtyp r1 = ucl_sqrt(rsq1);
      numtyp r1inv = ucl_rsqrt(rsq1);
      int offset_kf;
      if (ijnum >= 0) {
        offset_kf = offset_k;
      } else {
        ijnum = red_acc[2*m+0];
        offset_kf = red_acc[2*m+1];
      }
      if (ijnum < 0) ijnum = red_acc[m];

      // idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
      int idx = ijnum;
Loading