Commit 00aafef1, authored by Steve Plimpton and committed by GitHub
Browse files

Merge pull request #597 from ndtrung81/three-body-short-nlist

Implementing short neighbor lists for three-body gpu styles
parents 0b3f1b8a 3e9b41c6
Loading
Loading
Loading
Loading
+12 −12
Original line number Diff line number Diff line
@@ -22,21 +22,21 @@
  offset=tid & (t_per_atom-1);                                               \
  ii=fast_mul((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom)+tid/t_per_atom;

#define nbor_info(nbor_mem, packed_mem, nbor_stride, t_per_atom, ii, offset, \
                  i, numj, stride, nbor_end, nbor_begin)                     \
  i=nbor_mem[ii];                                                            \
  nbor_begin=ii+nbor_stride;                                                 \
  numj=nbor_mem[nbor_begin];                                                 \
  if (nbor_mem==packed_mem) {                                                \
    nbor_begin+=nbor_stride+fast_mul(ii,t_per_atom-1);                       \
    stride=fast_mul(t_per_atom,nbor_stride);                                 \
    nbor_end=nbor_begin+fast_mul(numj/t_per_atom,stride)+(numj & (t_per_atom-1)); \
#define nbor_info(dev_nbor, dev_packed, nbor_pitch, t_per_atom, ii, offset,  \
                  i, numj, n_stride, nbor_end, nbor_begin)                   \
  i=dev_nbor[ii];                                                            \
  nbor_begin=ii+nbor_pitch;                                                  \
  numj=dev_nbor[nbor_begin];                                                 \
  if (dev_nbor==dev_packed) {                                                \
    nbor_begin+=nbor_pitch+fast_mul(ii,t_per_atom-1);                        \
    n_stride=fast_mul(t_per_atom,nbor_pitch);                                \
    nbor_end=nbor_begin+fast_mul(numj/t_per_atom,n_stride)+(numj & (t_per_atom-1)); \
    nbor_begin+=offset;                                                      \
  } else {                                                                   \
    nbor_begin+=nbor_stride;                                                 \
    nbor_begin=nbor_mem[nbor_begin];                                         \
    nbor_begin+=nbor_pitch;                                                  \
    nbor_begin=dev_nbor[nbor_begin];                                         \
    nbor_end=nbor_begin+numj;                                                \
    stride=t_per_atom;                                                       \
    n_stride=t_per_atom;                                                     \
    nbor_begin+=offset;                                                      \
  }

+44 −13
Original line number Diff line number Diff line
@@ -53,8 +53,8 @@ int BaseThreeT::init_three(const int nlocal, const int nall,
                           const int max_nbors, const int maxspecial,
                           const double cell_size, const double gpu_split,
                           FILE *_screen, const void *pair_program,
                           const char *k_two, const char *k_three_center,
                           const char *k_three_end) {
                           const char *two, const char *three_center,
                           const char *three_end, const char *short_nbor) {
  screen=_screen;

  int gpu_nbor=0;
@@ -70,10 +70,10 @@ int BaseThreeT::init_three(const int nlocal, const int nall,
    _gpu_host=1;

  _threads_per_atom=device->threads_per_atom();
  if (_threads_per_atom>1 && gpu_nbor==0) {
  if (_threads_per_atom>1 && gpu_nbor==0) { // neigh no and tpa > 1
    nbor->packing(true);
    _nbor_data=&(nbor->dev_packed);
  } else
  } else  // neigh yes or tpa == 1
    _nbor_data=&(nbor->dev_nbor);
  if (_threads_per_atom*_threads_per_atom>device->warp_size())
    return -10;
@@ -97,7 +97,7 @@ int BaseThreeT::init_three(const int nlocal, const int nall,

  _block_pair=device->pair_block_size();
  _block_size=device->block_ellipse();
  compile_kernels(*ucl_device,pair_program,k_two,k_three_center,k_three_end);
  compile_kernels(*ucl_device,pair_program,two,three_center,three_end,short_nbor);

  // Initialize host-device load balancer
  hd_balancer.init(device,gpu_nbor,gpu_split);
@@ -113,6 +113,11 @@ int BaseThreeT::init_three(const int nlocal, const int nall,
  _max_an_bytes+=ans2->gpu_bytes();
  #endif

  int ef_nall=nall;
  if (ef_nall==0)
    ef_nall=2000;
  dev_short_nbor.alloc(ef_nall*(2+max_nbors),*(this->ucl_device),UCL_READ_WRITE);

  return 0;
}

@@ -136,6 +141,7 @@ void BaseThreeT::clear_atomic() {
    k_three_end.clear();
    k_three_end_vatom.clear();
    k_pair.clear();
    k_short_nbor.clear();
    delete pair_program;
    _compiled=false;
  }
@@ -143,6 +149,7 @@ void BaseThreeT::clear_atomic() {
  time_pair.clear();
  hd_balancer.clear();

  dev_short_nbor.clear();
  nbor->clear();
  ans->clear();
  #ifdef THREE_CONCURRENT
@@ -169,6 +176,8 @@ int * BaseThreeT::reset_nbors(const int nall, const int inum, const int nlist,
  if (!success)
    return NULL;

  _nall = nall;

  // originally the requirement that nall == nlist was enforced
  // to allow direct indexing neighbors of neighbors after re-arrangement
//  nbor->get_host3(nall,nlist,ilist,numj,firstneigh,block_size());
@@ -203,6 +212,8 @@ inline int BaseThreeT::build_nbor_list(const int inum, const int host_inum,
    return 0;
  atom->cast_copy_x(host_x,host_type);

  _nall = nall;

  int mn;
  nbor->build_nbor_list(host_x, nall, host_inum, nall, *atom, sublo, subhi, tag,
                        nspecial, special, success, mn);
@@ -247,12 +258,22 @@ void BaseThreeT::compute(const int f_ago, const int inum_full, const int nall,
    reset_nbors(nall, inum, nlist, ilist, numj, firstneigh, success);
    if (!success)
      return;
    _max_nbors = nbor->max_nbor_loop(nlist,numj,ilist);
  }

  atom->cast_x_data(host_x,host_type);
  hd_balancer.start_timer();
  atom->add_x_data(host_x,host_type);

  // re-allocate dev_short_nbor if necessary
  if (nall*(2+_max_nbors) > dev_short_nbor.cols()) {
    int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
    dev_short_nbor.resize((2+_max_nbors)*_nmax);
  }

  // _ainum to be used in loop() for short neighbor list build
  _ainum = nlist;

  int evatom=0;
  if (eatom || vatom)
    evatom=1;
@@ -300,7 +321,7 @@ int ** BaseThreeT::compute(const int ago, const int inum_full,

  // Build neighbor list on GPU if necessary
  if (ago==0) {
    build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
    _max_nbors = build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
                    sublo, subhi, tag, nspecial, special, success);
    if (!success)
      return NULL;
@@ -313,6 +334,15 @@ int ** BaseThreeT::compute(const int ago, const int inum_full,
  *ilist=nbor->host_ilist.begin();
  *jnum=nbor->host_acc.begin();

  // re-allocate dev_short_nbor if necessary
  if (nall*(2+_max_nbors) > dev_short_nbor.cols()) {
    int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
    dev_short_nbor.resize((2+_max_nbors)*_nmax);
  }

  // _ainum to be used in loop() for short neighbor list build
  _ainum = nall;

  int evatom=0;
  if (eatom || vatom)
    evatom=1;
@@ -339,19 +369,20 @@ double BaseThreeT::host_memory_usage_atomic() const {

template <class numtyp, class acctyp>
void BaseThreeT::compile_kernels(UCL_Device &dev, const void *pair_str,
                                 const char *ktwo, const char *kthree_center,
                                 const char *kthree_end) {
                                 const char *two, const char *three_center,
                                 const char *three_end, const char* short_nbor) {
  if (_compiled)
    return;

  std::string vatom_name=std::string(kthree_end)+"_vatom";
  std::string vatom_name=std::string(three_end)+"_vatom";

  pair_program=new UCL_Program(dev);
  pair_program->load_string(pair_str,device->compile_string().c_str());
  k_three_center.set_function(*pair_program,kthree_center);
  k_three_end.set_function(*pair_program,kthree_end);
  k_three_center.set_function(*pair_program,three_center);
  k_three_end.set_function(*pair_program,three_end);
  k_three_end_vatom.set_function(*pair_program,vatom_name.c_str());
  k_pair.set_function(*pair_program,ktwo);
  k_pair.set_function(*pair_program,two);
  k_short_nbor.set_function(*pair_program,short_nbor);
  pos_tex.get_texture(*pair_program,"pos_tex");

  #ifdef THREE_CONCURRENT
+13 −16
Original line number Diff line number Diff line
@@ -56,7 +56,8 @@ class BaseThree {
                 const int maxspecial, const double cell_size,
                 const double gpu_split, FILE *screen,
                 const void *pair_program, const char *k_two,
                 const char *k_three_center, const char *k_three_end);
                 const char *k_three_center, const char *k_three_end,
                 const char *k_short_nbor=NULL);

  /// Estimate the overhead for GPU context changes and CPU driver
  void estimate_gpu_overhead();
@@ -73,18 +74,18 @@ class BaseThree {
  }

  /// Check if there is enough storage for neighbors and realloc if not
  /** \param nlocal number of particles whose nbors must be stored on device
    * \param host_inum number of particles whose nbors need to be copied to host
    * \param current maximum number of neighbors
  /** \param inum number of particles whose nbors must be stored on device
    * \param max_nbors maximum number of neighbors
    * \param success set to false if insufficient memory
    * \note olist_size=total number of local particles **/
  inline void resize_local(const int inum, const int max_nbors, bool &success) {
    // Thin wrapper: forwards all three arguments unchanged to nbor->resize().
    nbor->resize(inum,max_nbors,success);
  }

  /// Check if there is enough storage for neighbors and realloc if not
  /** \param nlocal number of particles whose nbors must be stored on device
  /** \param inum number of particles whose nbors must be stored on device
    * \param host_inum number of particles whose nbors need to be copied to host
    * \param current maximum number of neighbors
    * \param max_nbors current maximum number of neighbors
    * \note host_inum is 0 if the host is performing neighboring
    * \note nlocal+host_inum=total number local particles
    * \note olist_size=0 **/
@@ -143,14 +144,6 @@ class BaseThree {
               const bool vflag, const bool eatom, const bool vatom,
               int &host_start, const double cpu_time, bool &success);

  /// Pair loop with device neighboring
  int * compute(const int ago, const int inum_full, const int nall,
                double **host_x, int *host_type, double *sublo,
                double *subhi, tagint *tag, int **nspecial,
                tagint **special, const bool eflag, const bool vflag,
                const bool eatom, const bool vatom, int &host_start,
                const double cpu_time, bool &success);

  /// Pair loop with device neighboring
  int ** compute(const int ago, const int inum_full,
                 const int nall, double **host_x, int *host_type, double *sublo,
@@ -193,6 +186,9 @@ class BaseThree {
  /// Neighbor data
  Neighbor *nbor;

  UCL_D_Vec<int> dev_short_nbor;
  UCL_Kernel k_short_nbor;

  // ------------------------- DEVICE KERNELS -------------------------
  UCL_Program *pair_program;
  UCL_Kernel k_pair, k_three_center, k_three_end, k_three_end_vatom;
@@ -207,12 +203,13 @@ class BaseThree {
  int _block_pair, _block_size, _threads_per_atom, _end_command_queue;
  int _gpu_nbor;
  double _max_bytes, _max_an_bytes;
  int _max_nbors, _ainum, _nall;
  double _gpu_overhead, _driver_overhead;
  UCL_D_Vec<int> *_nbor_data;

  void compile_kernels(UCL_Device &dev, const void *pair_string,
                       const char *k_two, const char *k_three_center,
                       const char *k_three_end);
                       const char *two, const char *three_center,
                       const char *three_end, const char* short_nbor);

  virtual void loop(const bool _eflag, const bool _vflag,
                    const int evatom) = 0;
+19 −7
Original line number Diff line number Diff line
@@ -55,7 +55,7 @@ int SWT::init(const int ntypes, const int nlocal, const int nall, const int max_
  int success;
  success=this->init_three(nlocal,nall,max_nbors,0,cell_size,gpu_split,
                           _screen,sw,"k_sw","k_sw_three_center",
                           "k_sw_three_end");
                           "k_sw_three_end","k_sw_short_nbor");
  if (success!=0)
    return success;

@@ -193,19 +193,30 @@ void SWT::loop(const bool _eflag, const bool _vflag, const int evatom) {
  else
    vflag=0;

  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
  // build the short neighbor list
  int ainum=this->_ainum;
  int nbor_pitch=this->nbor->nbor_pitch();
  int GX=static_cast<int>(ceil(static_cast<double>(ainum)/
                               (BX/this->_threads_per_atom)));
  this->k_short_nbor.set_size(GX,BX);
  this->k_short_nbor.run(&this->atom->x, &sw3, &map, &elem2param, &_nelements,
                 &this->nbor->dev_nbor, &this->_nbor_data->begin(),
                 &this->dev_short_nbor, &ainum,
                 &nbor_pitch, &this->_threads_per_atom);

  // this->_nbor_data == nbor->dev_packed for gpu_nbor == 0 and tpa > 1
  // this->_nbor_data == nbor->dev_nbor for gpu_nbor == 1 or tpa == 1
  int ainum=this->ans->inum();
  int nbor_pitch=this->nbor->nbor_pitch();
  ainum=this->ans->inum();
  nbor_pitch=this->nbor->nbor_pitch();
  GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
                               (BX/this->_threads_per_atom)));
  this->time_pair.start();
  
  this->k_pair.set_size(GX,BX);
  this->k_pair.run(&this->atom->x, &sw1, &sw2, &sw3,
                   &map, &elem2param, &_nelements,
                   &this->nbor->dev_nbor, &this->_nbor_data->begin(),
                   &this->dev_short_nbor,
                   &this->ans->force, &this->ans->engv,
                   &eflag, &vflag, &ainum, &nbor_pitch,
                   &this->_threads_per_atom);
@@ -217,6 +228,7 @@ void SWT::loop(const bool _eflag, const bool _vflag, const int evatom) {
  this->k_three_center.run(&this->atom->x, &sw1, &sw2, &sw3,
                           &map, &elem2param, &_nelements,
                           &this->nbor->dev_nbor, &this->_nbor_data->begin(),
                           &this->dev_short_nbor,
                           &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum,
                           &nbor_pitch, &this->_threads_per_atom, &evatom);

@@ -231,7 +243,7 @@ void SWT::loop(const bool _eflag, const bool _vflag, const int evatom) {
    this->k_three_end_vatom.run(&this->atom->x, &sw1, &sw2, &sw3,
                          &map, &elem2param, &_nelements,
                          &this->nbor->dev_nbor, &this->_nbor_data->begin(),
                          &this->nbor->dev_acc,
                          &this->nbor->dev_acc, &this->dev_short_nbor,
                          &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
                          &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);

@@ -240,7 +252,7 @@ void SWT::loop(const bool _eflag, const bool _vflag, const int evatom) {
    this->k_three_end.run(&this->atom->x, &sw1, &sw2, &sw3,
                          &map, &elem2param, &_nelements,
                          &this->nbor->dev_nbor, &this->_nbor_data->begin(),
                          &this->nbor->dev_acc,
                          &this->nbor->dev_acc, &this->dev_short_nbor,
                          &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
                          &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);

+133 −16
Original line number Diff line number Diff line
@@ -130,6 +130,63 @@ texture<int4> sw3_tex;

#endif

// Build a distance-filtered ("short") neighbor list in dev_short_nbor.
//
// For each atom slot ii < inum, the thread walks its share of the neighbor
// entries read from dev_packed and keeps only those neighbors j whose squared
// distance to i is below sw3[ijparam].y. The kept entries are written to
// dev_short_nbor using the same starting index and stride that nbor_info
// produced for the full list:
//   dev_short_nbor[m]                 -> number of kept neighbors (ncount)
//   dev_short_nbor[m + k*n_stride]    -> k-th kept neighbor, k = 1..ncount
// where m is the thread's first index into the full list.
//
// Parameters:
//   x_             atom positions; the reads below go through fetch4/pos_tex
//                  (NOTE(review): whether x_ itself is dereferenced depends on
//                  how fetch4 is defined for the build - confirm)
//   sw3            per-parameter-set data; only the .y component is read here
//                  and is used as the squared cutoff
//   map            indexed by the type stored in the .w component of a position
//   elem2param     indexed by a flattened (itype, jtype, jtype) triple
//   nelements      extent used to flatten the elem2param index
//   dev_nbor       neighbor storage passed to nbor_info
//   dev_packed     neighbor storage that neighbor indices are read from
//   dev_short_nbor output buffer for the short list (layout described above)
//   inum           number of atom slots to process (guards ii)
//   nbor_pitch     pitch passed to nbor_info
//   t_per_atom     threads cooperating on one atom (used by atom_info/nbor_info)
__kernel void k_sw_short_nbor(const __global numtyp4 *restrict x_,
                           const __global numtyp4 *restrict sw3,
                           const __global int *restrict map,
                           const __global int *restrict elem2param,
                           const int nelements,
                           const __global int * dev_nbor,
                           const __global int * dev_packed,
                           __global int * dev_short_nbor,
                           const int inum, const int nbor_pitch, const int t_per_atom) {
  // n_stride is assigned inside nbor_info by every thread that passes the
  // ii<inum guard; both assignments in that macro depend only on kernel
  // arguments (t_per_atom, nbor_pitch), so all writers store the same value.
  __local int n_stride;
  int tid, ii, offset;
  atom_info(t_per_atom,ii,tid,offset);

  if (ii<inum) {
    int nbor, nbor_end;
    int i, numj;
    // Sets i, numj, n_stride, nbor_end and the starting index nbor.
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,nbor_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    int itype=ix.w;
    itype=map[itype];

    int ncount = 0;
    // Remember the first slot: it holds the count, not a neighbor index.
    int m = nbor;
    // Provisional value; overwritten with ncount after the loop.
    dev_short_nbor[m] = 0;
    // Kept neighbors are stored starting one stride after the count slot.
    int nbor_short = nbor+n_stride;

    for ( ; nbor<nbor_end; nbor+=n_stride) {

      int j=dev_packed[nbor];
      // Keep the unmasked entry so the bits cleared by NEIGHMASK are
      // preserved in the short list; the masked j is used for lookups only.
      int nj = j;
      j &= NEIGHMASK;

      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
      int jtype=jx.w;
      jtype=map[jtype];
      // Parameter set for the (i, j, j) combination.
      int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];

      // Compute r12
      numtyp delx = ix.x-jx.x;
      numtyp dely = ix.y-jx.y;
      numtyp delz = ix.z-jx.z;
      numtyp rsq = delx*delx+dely*dely+delz*delz;

      if (rsq<sw3[ijparam].y) { // sw_cutsq = sw3[ijparam].y
        // Append the neighbor and advance the write position by one stride.
        dev_short_nbor[nbor_short] = nj;
        nbor_short += n_stride;
        ncount++;
      }
    } // for nbor

    // store the number of neighbors for each thread
    dev_short_nbor[m] = ncount;

  } // if ii
}

__kernel void k_sw(const __global numtyp4 *restrict x_,
                   const __global numtyp4 *restrict sw1,
@@ -140,6 +197,7 @@ __kernel void k_sw(const __global numtyp4 *restrict x_,
                   const int nelements,
                   const __global int * dev_nbor,
                   const __global int * dev_packed,
                   const __global int * dev_short_nbor,
                   __global acctyp4 *restrict ans,
                   __global acctyp *restrict engv,
                   const int eflag, const int vflag, const int inum,
@@ -158,8 +216,8 @@ __kernel void k_sw(const __global numtyp4 *restrict x_,
  __syncthreads();

  if (ii<inum) {
    int nbor, nbor_end;
    int i, numj;
    int nbor, nbor_end, i, numj;
    const int* nbor_mem = dev_packed;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,nbor_end,nbor);

@@ -167,9 +225,17 @@ __kernel void k_sw(const __global numtyp4 *restrict x_,
    int itype=ix.w;
    itype=map[itype];

    // recalculate numj and nbor_end for use of the short nbor list
    if (dev_packed==dev_nbor) {
      numj = dev_short_nbor[nbor];
      nbor += n_stride;
      nbor_end = nbor+fast_mul(numj,n_stride);
      nbor_mem = dev_short_nbor;
    }

    for ( ; nbor<nbor_end; nbor+=n_stride) {

      int j=dev_packed[nbor];
      int j=nbor_mem[nbor];
      j &= NEIGHMASK;

      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@@ -337,6 +403,7 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
                                const int nelements,
                                const __global int * dev_nbor,
                                const __global int * dev_packed,
                                const __global int * dev_short_nbor,
                                __global acctyp4 *restrict ans,
                                __global acctyp *restrict engv,
                                const int eflag, const int vflag,
@@ -361,7 +428,7 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,

  if (ii<inum) {
    int i, numj, nbor_j, nbor_end;

    const int* nbor_mem = dev_packed;
    int offset_j=offset/t_per_atom;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
              n_stride,nbor_end,nbor_j);
@@ -371,9 +438,18 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
    int itype=ix.w;
    itype=map[itype];

    // recalculate numj and nbor_end for use of the short nbor list
    if (dev_packed==dev_nbor) {
      numj = dev_short_nbor[nbor_j];
      nbor_j += n_stride;
      nbor_end = nbor_j+fast_mul(numj,n_stride);
      nbor_mem = dev_short_nbor;
    }
    int nborj_start = nbor_j;

    for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {

      int j=dev_packed[nbor_j];
      int j=nbor_mem[nbor_j];
      j &= NEIGHMASK;

      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@@ -395,14 +471,23 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
      sw_sigma_gamma_ij=sw1_ijparam.y*sw1_ijparam.w; //sw_sigma*sw_gamma;
      sw_cut_ij=sw3_ijparam.x;

      int nbor_k=nbor_j-offset_j+offset_k;
      if (nbor_k<=nbor_j)
        nbor_k+=n_stride;
      int nbor_k,k_end;
      if (dev_packed==dev_nbor) {
        nbor_k=nborj_start-offset_j+offset_k;
        int numk = dev_short_nbor[nbor_k-n_stride];
        k_end = nbor_k+fast_mul(numk,n_stride);
      } else {
        nbor_k = nbor_j-offset_j+offset_k;
        if (nbor_k<=nbor_j) nbor_k += n_stride;
        k_end = nbor_end;
      }

      for ( ; nbor_k<nbor_end; nbor_k+=n_stride) {
        int k=dev_packed[nbor_k];
      for ( ; nbor_k<k_end; nbor_k+=n_stride) {
        int k=nbor_mem[nbor_k];
        k &= NEIGHMASK;

        if (dev_packed==dev_nbor && k <= j) continue;

        numtyp4 kx; fetch4(kx,k,pos_tex);
        int ktype=kx.w;
        ktype=map[ktype];
@@ -460,6 +545,7 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
                             const __global int * dev_nbor,
                             const __global int * dev_packed,
                             const __global int * dev_acc,
                             const __global int * dev_short_nbor,
                             __global acctyp4 *restrict ans,
                             __global acctyp *restrict engv,
                             const int eflag, const int vflag,
@@ -484,7 +570,7 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,

  if (ii<inum) {
    int i, numj, nbor_j, nbor_end, k_end;

    const int* nbor_mem = dev_packed;
    int offset_j=offset/t_per_atom;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
              n_stride,nbor_end,nbor_j);
@@ -494,8 +580,16 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
    int itype=ix.w;
    itype=map[itype];

    // recalculate numj and nbor_end for use of the short nbor list
    if (dev_packed==dev_nbor) {
      numj = dev_short_nbor[nbor_j];
      nbor_j += n_stride;
      nbor_end = nbor_j+fast_mul(numj,n_stride);
      nbor_mem = dev_short_nbor;
    }

    for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
      int j=dev_packed[nbor_j];
      int j=nbor_mem[nbor_j];
      j &= NEIGHMASK;

      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@@ -534,8 +628,15 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
        nbor_k+=offset_k;
      }

      // recalculate numk and k_end for the use of short neighbor list
      if (dev_packed==dev_nbor) {
        numk = dev_short_nbor[nbor_k];
        nbor_k += n_stride;
        k_end = nbor_k+fast_mul(numk,n_stride);
      }

      for ( ; nbor_k<k_end; nbor_k+=n_stride) {
        int k=dev_packed[nbor_k];
        int k=nbor_mem[nbor_k];
        k &= NEIGHMASK;

        if (k == i) continue;
@@ -598,6 +699,7 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
                             const __global int * dev_nbor,
                             const __global int * dev_packed,
                             const __global int * dev_acc,
                             const __global int * dev_short_nbor,
                             __global acctyp4 *restrict ans,
                             __global acctyp *restrict engv,
                             const int eflag, const int vflag,
@@ -622,7 +724,7 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,

  if (ii<inum) {
    int i, numj, nbor_j, nbor_end, k_end;

    const int* nbor_mem = dev_packed;
    int offset_j=offset/t_per_atom;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
              n_stride,nbor_end,nbor_j);
@@ -632,8 +734,16 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
    int itype=ix.w;
    itype=map[itype];

    // recalculate numj and nbor_end for use of the short nbor list
    if (dev_packed==dev_nbor) {
      numj = dev_short_nbor[nbor_j];
      nbor_j += n_stride;
      nbor_end = nbor_j+fast_mul(numj,n_stride);
      nbor_mem = dev_short_nbor;
    }

    for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
      int j=dev_packed[nbor_j];
      int j=nbor_mem[nbor_j];
      j &= NEIGHMASK;

      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@@ -672,8 +782,15 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
        nbor_k+=offset_k;
      }

      // recalculate numk and k_end for the use of short neighbor list
      if (dev_packed==dev_nbor) {
        numk = dev_short_nbor[nbor_k];
        nbor_k += n_stride;
        k_end = nbor_k+fast_mul(numk,n_stride);
      }

      for ( ; nbor_k<k_end; nbor_k+=n_stride) {
        int k=dev_packed[nbor_k];
        int k=nbor_mem[nbor_k];
        k &= NEIGHMASK;

        if (k == i) continue;
Loading