Merge pull request #597 from ndtrung81/three-body-short-nlist (00aafef1) · Commits · 郑智淋 / lammps

lib/gpu/lal_aux_fun1.h

+12 −12

Original line number	Diff line number	Diff line
		@@ -22,21 +22,21 @@
		offset=tid & (t_per_atom-1); \
		ii=fast_mul((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom)+tid/t_per_atom;

		#define nbor_info(nbor_mem, packed_mem, nbor_stride, t_per_atom, ii, offset, \
		i, numj, stride, nbor_end, nbor_begin) \
		i=nbor_mem[ii]; \
		nbor_begin=ii+nbor_stride; \
		numj=nbor_mem[nbor_begin]; \
		if (nbor_mem==packed_mem) { \
		nbor_begin+=nbor_stride+fast_mul(ii,t_per_atom-1); \
		stride=fast_mul(t_per_atom,nbor_stride); \
		nbor_end=nbor_begin+fast_mul(numj/t_per_atom,stride)+(numj & (t_per_atom-1)); \
		#define nbor_info(dev_nbor, dev_packed, nbor_pitch, t_per_atom, ii, offset, \
		i, numj, n_stride, nbor_end, nbor_begin) \
		i=dev_nbor[ii]; \
		nbor_begin=ii+nbor_pitch; \
		numj=dev_nbor[nbor_begin]; \
		if (dev_nbor==dev_packed) { \
		nbor_begin+=nbor_pitch+fast_mul(ii,t_per_atom-1); \
		n_stride=fast_mul(t_per_atom,nbor_pitch); \
		nbor_end=nbor_begin+fast_mul(numj/t_per_atom,n_stride)+(numj & (t_per_atom-1)); \
		nbor_begin+=offset; \
		} else { \
		nbor_begin+=nbor_stride; \
		nbor_begin=nbor_mem[nbor_begin]; \
		nbor_begin+=nbor_pitch; \
		nbor_begin=dev_nbor[nbor_begin]; \
		nbor_end=nbor_begin+numj; \
		stride=t_per_atom; \
		n_stride=t_per_atom; \
		nbor_begin+=offset; \
		}

lib/gpu/lal_base_three.cpp

+44 −13

Original line number	Diff line number	Diff line
		@@ -53,8 +53,8 @@ int BaseThreeT::init_three(const int nlocal, const int nall,
		const int max_nbors, const int maxspecial,
		const double cell_size, const double gpu_split,
		FILE *_screen, const void *pair_program,
		const char *k_two, const char *k_three_center,
		const char *k_three_end) {
		const char *two, const char *three_center,
		const char *three_end, const char *short_nbor) {
		screen=_screen;

		int gpu_nbor=0;
		@@ -70,10 +70,10 @@ int BaseThreeT::init_three(const int nlocal, const int nall,
		_gpu_host=1;

		_threads_per_atom=device->threads_per_atom();
		if (_threads_per_atom>1 && gpu_nbor==0) {
		if (_threads_per_atom>1 && gpu_nbor==0) { // neigh no and tpa > 1
		nbor->packing(true);
		_nbor_data=&(nbor->dev_packed);
		} else
		} else // neigh yes or tpa == 1
		_nbor_data=&(nbor->dev_nbor);
		if (_threads_per_atom*_threads_per_atom>device->warp_size())
		return -10;
		@@ -97,7 +97,7 @@ int BaseThreeT::init_three(const int nlocal, const int nall,

		_block_pair=device->pair_block_size();
		_block_size=device->block_ellipse();
		compile_kernels(*ucl_device,pair_program,k_two,k_three_center,k_three_end);
		compile_kernels(*ucl_device,pair_program,two,three_center,three_end,short_nbor);

		// Initialize host-device load balancer
		hd_balancer.init(device,gpu_nbor,gpu_split);
		@@ -113,6 +113,11 @@ int BaseThreeT::init_three(const int nlocal, const int nall,
		_max_an_bytes+=ans2->gpu_bytes();
		#endif

		int ef_nall=nall;
		if (ef_nall==0)
		ef_nall=2000;
		dev_short_nbor.alloc(ef_nall*(2+max_nbors),*(this->ucl_device),UCL_READ_WRITE);

		return 0;
		}

		@@ -136,6 +141,7 @@ void BaseThreeT::clear_atomic() {
		k_three_end.clear();
		k_three_end_vatom.clear();
		k_pair.clear();
		k_short_nbor.clear();
		delete pair_program;
		_compiled=false;
		}
		@@ -143,6 +149,7 @@ void BaseThreeT::clear_atomic() {
		time_pair.clear();
		hd_balancer.clear();

		dev_short_nbor.clear();
		nbor->clear();
		ans->clear();
		#ifdef THREE_CONCURRENT
		@@ -169,6 +176,8 @@ int * BaseThreeT::reset_nbors(const int nall, const int inum, const int nlist,
		if (!success)
		return NULL;

		_nall = nall;

		// originally the requirement that nall == nlist was enforced
		// to allow direct indexing neighbors of neighbors after re-arrangement
		// nbor->get_host3(nall,nlist,ilist,numj,firstneigh,block_size());
		@@ -203,6 +212,8 @@ inline int BaseThreeT::build_nbor_list(const int inum, const int host_inum,
		return 0;
		atom->cast_copy_x(host_x,host_type);

		_nall = nall;

		int mn;
		nbor->build_nbor_list(host_x, nall, host_inum, nall, *atom, sublo, subhi, tag,
		nspecial, special, success, mn);
		@@ -247,12 +258,22 @@ void BaseThreeT::compute(const int f_ago, const int inum_full, const int nall,
		reset_nbors(nall, inum, nlist, ilist, numj, firstneigh, success);
		if (!success)
		return;
		_max_nbors = nbor->max_nbor_loop(nlist,numj,ilist);
		}

		atom->cast_x_data(host_x,host_type);
		hd_balancer.start_timer();
		atom->add_x_data(host_x,host_type);

		// re-allocate dev_short_nbor if necessary
		if (nall*(2+_max_nbors) > dev_short_nbor.cols()) {
		int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
		dev_short_nbor.resize((2+_max_nbors)*_nmax);
		}

		// _ainum to be used in loop() for short neighbor list build
		_ainum = nlist;

		int evatom=0;
		if (eatom || vatom)
		evatom=1;
		@@ -300,7 +321,7 @@ int ** BaseThreeT::compute(const int ago, const int inum_full,

		// Build neighbor list on GPU if necessary
		if (ago==0) {
		build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
		_max_nbors = build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
		sublo, subhi, tag, nspecial, special, success);
		if (!success)
		return NULL;
		@@ -313,6 +334,15 @@ int ** BaseThreeT::compute(const int ago, const int inum_full,
		*ilist=nbor->host_ilist.begin();
		*jnum=nbor->host_acc.begin();

		// re-allocate dev_short_nbor if necessary
		if (nall*(2+_max_nbors) > dev_short_nbor.cols()) {
		int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
		dev_short_nbor.resize((2+_max_nbors)*_nmax);
		}

		// _ainum to be used in loop() for short neighbor list build
		_ainum = nall;

		int evatom=0;
		if (eatom || vatom)
		evatom=1;
		@@ -339,19 +369,20 @@ double BaseThreeT::host_memory_usage_atomic() const {

		template <class numtyp, class acctyp>
		void BaseThreeT::compile_kernels(UCL_Device &dev, const void *pair_str,
		const char *ktwo, const char *kthree_center,
		const char *kthree_end) {
		const char *two, const char *three_center,
		const char *three_end, const char *short_nbor) {
		if (_compiled)
		return;

		std::string vatom_name=std::string(kthree_end)+"_vatom";
		std::string vatom_name=std::string(three_end)+"_vatom";

		pair_program=new UCL_Program(dev);
		pair_program->load_string(pair_str,device->compile_string().c_str());
		k_three_center.set_function(*pair_program,kthree_center);
		k_three_end.set_function(*pair_program,kthree_end);
		k_three_center.set_function(*pair_program,three_center);
		k_three_end.set_function(*pair_program,three_end);
		k_three_end_vatom.set_function(*pair_program,vatom_name.c_str());
		k_pair.set_function(*pair_program,ktwo);
		k_pair.set_function(*pair_program,two);
		k_short_nbor.set_function(*pair_program,short_nbor);
		pos_tex.get_texture(*pair_program,"pos_tex");

		#ifdef THREE_CONCURRENT

lib/gpu/lal_base_three.h

+13 −16

Original line number	Diff line number	Diff line
		@@ -56,7 +56,8 @@ class BaseThree {
		const int maxspecial, const double cell_size,
		const double gpu_split, FILE *screen,
		const void *pair_program, const char *k_two,
		const char *k_three_center, const char *k_three_end);
		const char *k_three_center, const char *k_three_end,
		const char *k_short_nbor=NULL);

		/// Estimate the overhead for GPU context changes and CPU driver
		void estimate_gpu_overhead();
		@@ -73,18 +74,18 @@ class BaseThree {
		}

		/// Check if there is enough storage for neighbors and realloc if not
		/** \param nlocal number of particles whose nbors must be stored on device
		* \param host_inum number of particles whose nbors need to copied to host
		* \param current maximum number of neighbors
		/** \param inum number of particles whose nbors must be stored on device
		* \param max_nbors maximum number of neighbors
		* \param success set to false if insufficient memory
		* \note olist_size=total number of local particles **/
		inline void resize_local(const int inum, const int max_nbors, bool &success) {
		nbor->resize(inum,max_nbors,success);
		}

		/// Check if there is enough storage for neighbors and realloc if not
		/** \param nlocal number of particles whose nbors must be stored on device
		/** \param inum number of particles whose nbors must be stored on device
		* \param host_inum number of particles whose nbors need to copied to host
		* \param current maximum number of neighbors
		* \param max_nbors current maximum number of neighbors
		* \note host_inum is 0 if the host is performing neighboring
		* \note nlocal+host_inum=total number local particles
		* \note olist_size=0 **/
		@@ -143,14 +144,6 @@ class BaseThree {
		const bool vflag, const bool eatom, const bool vatom,
		int &host_start, const double cpu_time, bool &success);

		/// Pair loop with device neighboring
		int * compute(const int ago, const int inum_full, const int nall,
		double **host_x, int *host_type, double *sublo,
		double *subhi, tagint *tag, int **nspecial,
		tagint **special, const bool eflag, const bool vflag,
		const bool eatom, const bool vatom, int &host_start,
		const double cpu_time, bool &success);

		/// Pair loop with device neighboring
		int ** compute(const int ago, const int inum_full,
		const int nall, double **host_x, int *host_type, double *sublo,
		@@ -193,6 +186,9 @@ class BaseThree {
		/// Neighbor data
		Neighbor *nbor;

		UCL_D_Vec<int> dev_short_nbor;
		UCL_Kernel k_short_nbor;

		// ------------------------- DEVICE KERNELS -------------------------
		UCL_Program *pair_program;
		UCL_Kernel k_pair, k_three_center, k_three_end, k_three_end_vatom;
		@@ -207,12 +203,13 @@ class BaseThree {
		int _block_pair, _block_size, _threads_per_atom, _end_command_queue;
		int _gpu_nbor;
		double _max_bytes, _max_an_bytes;
		int _max_nbors, _ainum, _nall;
		double _gpu_overhead, _driver_overhead;
		UCL_D_Vec<int> *_nbor_data;

		void compile_kernels(UCL_Device &dev, const void *pair_string,
		const char *k_two, const char *k_three_center,
		const char *k_three_end);
		const char *two, const char *three_center,
		const char *three_end, const char *short_nbor);

		virtual void loop(const bool _eflag, const bool _vflag,
		const int evatom) = 0;

lib/gpu/lal_sw.cpp

+19 −7

Original line number	Diff line number	Diff line
		@@ -55,7 +55,7 @@ int SWT::init(const int ntypes, const int nlocal, const int nall, const int max_
		int success;
		success=this->init_three(nlocal,nall,max_nbors,0,cell_size,gpu_split,
		_screen,sw,"k_sw","k_sw_three_center",
		"k_sw_three_end");
		"k_sw_three_end","k_sw_short_nbor");
		if (success!=0)
		return success;

		@@ -193,19 +193,30 @@ void SWT::loop(const bool _eflag, const bool _vflag, const int evatom) {
		else
		vflag=0;

		int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
		// build the short neighbor list
		int ainum=this->_ainum;
		int nbor_pitch=this->nbor->nbor_pitch();
		int GX=static_cast<int>(ceil(static_cast<double>(ainum)/
		(BX/this->_threads_per_atom)));
		this->k_short_nbor.set_size(GX,BX);
		this->k_short_nbor.run(&this->atom->x, &sw3, &map, &elem2param, &_nelements,
		&this->nbor->dev_nbor, &this->_nbor_data->begin(),
		&this->dev_short_nbor, &ainum,
		&nbor_pitch, &this->_threads_per_atom);

		// this->_nbor_data == nbor->dev_packed for gpu_nbor == 0 and tpa > 1
		// this->_nbor_data == nbor->dev_nbor for gpu_nbor == 1 or tpa == 1
		int ainum=this->ans->inum();
		int nbor_pitch=this->nbor->nbor_pitch();
		ainum=this->ans->inum();
		nbor_pitch=this->nbor->nbor_pitch();
		GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
		(BX/this->_threads_per_atom)));
		this->time_pair.start();

		this->k_pair.set_size(GX,BX);
		this->k_pair.run(&this->atom->x, &sw1, &sw2, &sw3,
		&map, &elem2param, &_nelements,
		&this->nbor->dev_nbor, &this->_nbor_data->begin(),
		&this->dev_short_nbor,
		&this->ans->force, &this->ans->engv,
		&eflag, &vflag, &ainum, &nbor_pitch,
		&this->_threads_per_atom);
		@@ -217,6 +228,7 @@ void SWT::loop(const bool _eflag, const bool _vflag, const int evatom) {
		this->k_three_center.run(&this->atom->x, &sw1, &sw2, &sw3,
		&map, &elem2param, &_nelements,
		&this->nbor->dev_nbor, &this->_nbor_data->begin(),
		&this->dev_short_nbor,
		&this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum,
		&nbor_pitch, &this->_threads_per_atom, &evatom);

		@@ -231,7 +243,7 @@ void SWT::loop(const bool _eflag, const bool _vflag, const int evatom) {
		this->k_three_end_vatom.run(&this->atom->x, &sw1, &sw2, &sw3,
		&map, &elem2param, &_nelements,
		&this->nbor->dev_nbor, &this->_nbor_data->begin(),
		&this->nbor->dev_acc,
		&this->nbor->dev_acc, &this->dev_short_nbor,
		&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
		&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);

		@@ -240,7 +252,7 @@ void SWT::loop(const bool _eflag, const bool _vflag, const int evatom) {
		this->k_three_end.run(&this->atom->x, &sw1, &sw2, &sw3,
		&map, &elem2param, &_nelements,
		&this->nbor->dev_nbor, &this->_nbor_data->begin(),
		&this->nbor->dev_acc,
		&this->nbor->dev_acc, &this->dev_short_nbor,
		&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
		&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);

lib/gpu/lal_sw.cu

+133 −16

Original line number	Diff line number	Diff line
		@@ -130,6 +130,63 @@ texture<int4> sw3_tex;

		#endif

		/// Build the cutoff-filtered ("short") neighbor list for the SW kernels.
		/**
		 * Each thread walks its strided slice of the neighbor list of atom i,
		 * keeps only the neighbors j with rsq < sw3[ijparam].y, and writes them
		 * into dev_short_nbor at the same starting index and with the same stride
		 * (n_stride) as the list it reads from. The slot at the thread's starting
		 * index receives the number of neighbors kept.
		 *
		 * \param x_             atom positions/types (read through pos_tex via fetch4)
		 * \param sw3            per-parameter-set data; .y is compared against rsq
		 * \param map            maps the atom type stored in x.w to an element index
		 * \param elem2param     maps an element triplet index to a parameter-set index
		 * \param nelements      number of elements used to index elem2param
		 * \param dev_nbor       neighbor list storage passed to nbor_info
		 * \param dev_packed     neighbor list the neighbor indices are read from
		 * \param dev_short_nbor output: per-thread count followed by kept neighbors
		 * \param inum           number of atoms to process (threads with ii>=inum idle)
		 * \param nbor_pitch     pitch of the neighbor storage, passed to nbor_info
		 * \param t_per_atom     number of threads cooperating on one atom
		 **/
		__kernel void k_sw_short_nbor(const __global numtyp4 *restrict x_,
		                              const __global numtyp4 *restrict sw3,
		                              const __global int *restrict map,
		                              const __global int *restrict elem2param,
		                              const int nelements,
		                              const __global int * dev_nbor,
		                              const __global int * dev_packed,
		                              __global int * dev_short_nbor,
		                              const int inum, const int nbor_pitch,
		                              const int t_per_atom) {
		  __local int n_stride;
		  int tid, ii, offset;
		  atom_info(t_per_atom,ii,tid,offset);

		  if (ii<inum) {
		    int nbor, nbor_end;
		    int i, numj;
		    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
		              n_stride,nbor_end,nbor);

		    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
		    int itype=ix.w;
		    itype=map[itype];

		    int ncount = 0;
		    // m is this thread's first slot; it holds the count, not a neighbor
		    int m = nbor;
		    dev_short_nbor[m] = 0;
		    // kept neighbors start one stride after the count slot
		    int nbor_short = nbor+n_stride;

		    for ( ; nbor<nbor_end; nbor+=n_stride) {

		      int j=dev_packed[nbor];
		      // keep the unmasked value so the bits cleared by NEIGHMASK survive
		      int nj = j;
		      j &= NEIGHMASK;

		      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
		      int jtype=jx.w;
		      jtype=map[jtype];
		      // parameter set for the (i,j,j) element triplet
		      int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];

		      // Compute r12
		      numtyp delx = ix.x-jx.x;
		      numtyp dely = ix.y-jx.y;
		      numtyp delz = ix.z-jx.z;
		      numtyp rsq = delx*delx+dely*dely+delz*delz;

		      if (rsq<sw3[ijparam].y) { // sw_cutsq = sw3[ijparam].y
		        dev_short_nbor[nbor_short] = nj;
		        nbor_short += n_stride;
		        ncount++;
		      }
		    } // for nbor

		    // store the number of neighbors for each thread
		    dev_short_nbor[m] = ncount;

		  } // if ii
		}

		__kernel void k_sw(const __global numtyp4 *restrict x_,
		const __global numtyp4 *restrict sw1,
		@@ -140,6 +197,7 @@ __kernel void k_sw(const __global numtyp4 *restrict x_,
		const int nelements,
		const __global int * dev_nbor,
		const __global int * dev_packed,
		const __global int * dev_short_nbor,
		__global acctyp4 *restrict ans,
		__global acctyp *restrict engv,
		const int eflag, const int vflag, const int inum,
		@@ -158,8 +216,8 @@ __kernel void k_sw(const __global numtyp4 *restrict x_,
		__syncthreads();

		if (ii<inum) {
		int nbor, nbor_end;
		int i, numj;
		int nbor, nbor_end, i, numj;
		const int* nbor_mem = dev_packed;
		nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
		n_stride,nbor_end,nbor);

		@@ -167,9 +225,17 @@ __kernel void k_sw(const __global numtyp4 *restrict x_,
		int itype=ix.w;
		itype=map[itype];

		// recalculate numj and nbor_end for use of the short nbor list
		if (dev_packed==dev_nbor) {
		numj = dev_short_nbor[nbor];
		nbor += n_stride;
		nbor_end = nbor+fast_mul(numj,n_stride);
		nbor_mem = dev_short_nbor;
		}

		for ( ; nbor<nbor_end; nbor+=n_stride) {

		int j=dev_packed[nbor];
		int j=nbor_mem[nbor];
		j &= NEIGHMASK;

		numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
		@@ -337,6 +403,7 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
		const int nelements,
		const __global int * dev_nbor,
		const __global int * dev_packed,
		const __global int * dev_short_nbor,
		__global acctyp4 *restrict ans,
		__global acctyp *restrict engv,
		const int eflag, const int vflag,
		@@ -361,7 +428,7 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,

		if (ii<inum) {
		int i, numj, nbor_j, nbor_end;

		const int* nbor_mem = dev_packed;
		int offset_j=offset/t_per_atom;
		nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
		n_stride,nbor_end,nbor_j);
		@@ -371,9 +438,18 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
		int itype=ix.w;
		itype=map[itype];

		// recalculate numj and nbor_end for use of the short nbor list
		if (dev_packed==dev_nbor) {
		numj = dev_short_nbor[nbor_j];
		nbor_j += n_stride;
		nbor_end = nbor_j+fast_mul(numj,n_stride);
		nbor_mem = dev_short_nbor;
		}
		int nborj_start = nbor_j;

		for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {

		int j=dev_packed[nbor_j];
		int j=nbor_mem[nbor_j];
		j &= NEIGHMASK;

		numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
		@@ -395,14 +471,23 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
		sw_sigma_gamma_ij=sw1_ijparam.y*sw1_ijparam.w; //sw_sigma*sw_gamma;
		sw_cut_ij=sw3_ijparam.x;

		int nbor_k=nbor_j-offset_j+offset_k;
		if (nbor_k<=nbor_j)
		nbor_k+=n_stride;
		int nbor_k,k_end;
		if (dev_packed==dev_nbor) {
		nbor_k=nborj_start-offset_j+offset_k;
		int numk = dev_short_nbor[nbor_k-n_stride];
		k_end = nbor_k+fast_mul(numk,n_stride);
		} else {
		nbor_k = nbor_j-offset_j+offset_k;
		if (nbor_k<=nbor_j) nbor_k += n_stride;
		k_end = nbor_end;
		}

		for ( ; nbor_k<nbor_end; nbor_k+=n_stride) {
		int k=dev_packed[nbor_k];
		for ( ; nbor_k<k_end; nbor_k+=n_stride) {
		int k=nbor_mem[nbor_k];
		k &= NEIGHMASK;

		if (dev_packed==dev_nbor && k <= j) continue;

		numtyp4 kx; fetch4(kx,k,pos_tex);
		int ktype=kx.w;
		ktype=map[ktype];
		@@ -460,6 +545,7 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
		const __global int * dev_nbor,
		const __global int * dev_packed,
		const __global int * dev_acc,
		const __global int * dev_short_nbor,
		__global acctyp4 *restrict ans,
		__global acctyp *restrict engv,
		const int eflag, const int vflag,
		@@ -484,7 +570,7 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,

		if (ii<inum) {
		int i, numj, nbor_j, nbor_end, k_end;

		const int* nbor_mem = dev_packed;
		int offset_j=offset/t_per_atom;
		nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
		n_stride,nbor_end,nbor_j);
		@@ -494,8 +580,16 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
		int itype=ix.w;
		itype=map[itype];

		// recalculate numj and nbor_end for use of the short nbor list
		if (dev_packed==dev_nbor) {
		numj = dev_short_nbor[nbor_j];
		nbor_j += n_stride;
		nbor_end = nbor_j+fast_mul(numj,n_stride);
		nbor_mem = dev_short_nbor;
		}

		for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
		int j=dev_packed[nbor_j];
		int j=nbor_mem[nbor_j];
		j &= NEIGHMASK;

		numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
		@@ -534,8 +628,15 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
		nbor_k+=offset_k;
		}

		// recalculate numk and k_end for the use of short neighbor list
		if (dev_packed==dev_nbor) {
		numk = dev_short_nbor[nbor_k];
		nbor_k += n_stride;
		k_end = nbor_k+fast_mul(numk,n_stride);
		}

		for ( ; nbor_k<k_end; nbor_k+=n_stride) {
		int k=dev_packed[nbor_k];
		int k=nbor_mem[nbor_k];
		k &= NEIGHMASK;

		if (k == i) continue;
		@@ -598,6 +699,7 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
		const __global int * dev_nbor,
		const __global int * dev_packed,
		const __global int * dev_acc,
		const __global int * dev_short_nbor,
		__global acctyp4 *restrict ans,
		__global acctyp *restrict engv,
		const int eflag, const int vflag,
		@@ -622,7 +724,7 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,

		if (ii<inum) {
		int i, numj, nbor_j, nbor_end, k_end;

		const int* nbor_mem = dev_packed;
		int offset_j=offset/t_per_atom;
		nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
		n_stride,nbor_end,nbor_j);
		@@ -632,8 +734,16 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
		int itype=ix.w;
		itype=map[itype];

		// recalculate numj and nbor_end for use of the short nbor list
		if (dev_packed==dev_nbor) {
		numj = dev_short_nbor[nbor_j];
		nbor_j += n_stride;
		nbor_end = nbor_j+fast_mul(numj,n_stride);
		nbor_mem = dev_short_nbor;
		}

		for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
		int j=dev_packed[nbor_j];
		int j=nbor_mem[nbor_j];
		j &= NEIGHMASK;

		numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
		@@ -672,8 +782,15 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
		nbor_k+=offset_k;
		}

		// recalculate numk and k_end for the use of short neighbor list
		if (dev_packed==dev_nbor) {
		numk = dev_short_nbor[nbor_k];
		nbor_k += n_stride;
		k_end = nbor_k+fast_mul(numk,n_stride);
		}

		for ( ; nbor_k<k_end; nbor_k+=n_stride) {
		int k=dev_packed[nbor_k];
		int k=nbor_mem[nbor_k];
		k &= NEIGHMASK;

		if (k == i) continue;

Admin message