Implement GPU pair style lj/cut/tip4p/long/gpu (64bdc596) · Commits · 郑智淋 / lammps

lib/gpu/Nvidia.makefile

+16 −2

Original line number	Diff line number	Diff line
		@@ -82,7 +82,8 @@ OBJS = $(OBJ_DIR)/lal_atom.o $(OBJ_DIR)/lal_ans.o \
		$(OBJ_DIR)/lal_lj_expand_coul_long.o $(OBJ_DIR)/lal_lj_expand_coul_long_ext.o \
		$(OBJ_DIR)/lal_coul_long_cs.o $(OBJ_DIR)/lal_coul_long_cs_ext.o \
		$(OBJ_DIR)/lal_born_coul_long_cs.o $(OBJ_DIR)/lal_born_coul_long_cs_ext.o \
		$(OBJ_DIR)/lal_born_coul_wolf_cs.o $(OBJ_DIR)/lal_born_coul_wolf_cs_ext.o
		$(OBJ_DIR)/lal_born_coul_wolf_cs.o $(OBJ_DIR)/lal_born_coul_wolf_cs_ext.o \
		$(OBJ_DIR)/lal_lj_tip4p_long.o $(OBJ_DIR)/lal_lj_tip4p_long_ext.o

		CBNS = $(OBJ_DIR)/device.cubin $(OBJ_DIR)/device_cubin.h \
		$(OBJ_DIR)/atom.cubin $(OBJ_DIR)/atom_cubin.h \
		@@ -143,7 +144,8 @@ CBNS = $(OBJ_DIR)/device.cubin $(OBJ_DIR)/device_cubin.h \
		$(OBJ_DIR)/lj_expand_coul_long.cubin $(OBJ_DIR)/lj_expand_coul_long_cubin.h \
		$(OBJ_DIR)/coul_long_cs.cubin $(OBJ_DIR)/coul_long_cs_cubin.h \
		$(OBJ_DIR)/born_coul_long_cs.cubin $(OBJ_DIR)/born_coul_long_cs_cubin.h \
		$(OBJ_DIR)/born_coul_wolf_cs.cubin $(OBJ_DIR)/born_coul_wolf_cs_cubin.h
		$(OBJ_DIR)/born_coul_wolf_cs.cubin $(OBJ_DIR)/born_coul_wolf_cs_cubin.h \
		$(OBJ_DIR)/lj_tip4p_long.cubin $(OBJ_DIR)/lj_tip4p_long_cubin.h

		all: $(OBJ_DIR) $(GPU_LIB) $(EXECS)

		@@ -297,6 +299,18 @@ $(OBJ_DIR)/lal_lj.o: $(ALL_H) lal_lj.h lal_lj.cpp $(OBJ_DIR)/lj_cubin.h $(OBJ_DI
		$(OBJ_DIR)/lal_lj_ext.o: $(ALL_H) lal_lj.h lal_lj_ext.cpp lal_base_atomic.h
		$(CUDR) -o $@ -c lal_lj_ext.cpp -I$(OBJ_DIR)

		$(OBJ_DIR)/lj_tip4p_long.cubin: lal_lj_tip4p_long.cu lal_precision.h lal_preprocessor.h
		$(CUDA) --cubin -DNV_KERNEL -o $@ lal_lj_tip4p_long.cu

		$(OBJ_DIR)/lj_tip4p_long_cubin.h: $(OBJ_DIR)/lj_tip4p_long.cubin $(OBJ_DIR)/lj_tip4p_long.cubin
		$(BIN2C) -c -n lj_tip4p_long $(OBJ_DIR)/lj_tip4p_long.cubin > $(OBJ_DIR)/lj_tip4p_long_cubin.h

		$(OBJ_DIR)/lal_lj_tip4p_long.o: $(ALL_H) lal_lj_tip4p_long.h lal_lj_tip4p_long.cpp $(OBJ_DIR)/lj_tip4p_long_cubin.h $(OBJ_DIR)/lal_base_atomic.o
		$(CUDR) -o $@ -c lal_lj_tip4p_long.cpp -I$(OBJ_DIR)

		$(OBJ_DIR)/lal_lj_tip4p_long_ext.o: $(ALL_H) lal_lj_tip4p_long.h lal_lj_tip4p_long_ext.cpp lal_base_atomic.h
		$(CUDR) -o $@ -c lal_lj_tip4p_long_ext.cpp -I$(OBJ_DIR)

		$(OBJ_DIR)/lj_coul.cubin: lal_lj_coul.cu lal_precision.h lal_preprocessor.h
		$(CUDA) --cubin -DNV_KERNEL -o $@ lal_lj_coul.cu

lib/gpu/Opencl.makefile

+13 −2

Original line number	Diff line number	Diff line
		@@ -71,7 +71,8 @@ OBJS = $(OBJ_DIR)/lal_atom.o $(OBJ_DIR)/lal_answer.o \
		$(OBJ_DIR)/lal_lj_expand_coul_long.o $(OBJ_DIR)/lal_lj_expand_coul_long_ext.o \
		$(OBJ_DIR)/lal_coul_long_cs.o $(OBJ_DIR)/lal_coul_long_cs_ext.o \
		$(OBJ_DIR)/lal_born_coul_long_cs.o $(OBJ_DIR)/lal_born_coul_long_cs_ext.o \
		$(OBJ_DIR)/lal_born_coul_wolf_cs.o $(OBJ_DIR)/lal_born_coul_wolf_cs_ext.o
		$(OBJ_DIR)/lal_born_coul_wolf_cs.o $(OBJ_DIR)/lal_born_coul_wolf_cs_ext.o \
		$(OBJ_DIR)/lal_lj_tip4p_long.o $(OBJ_DIR)/lal_lj_tip4p_long_ext.o

		KERS = $(OBJ_DIR)/device_cl.h $(OBJ_DIR)/atom_cl.h \
		$(OBJ_DIR)/neighbor_cpu_cl.h $(OBJ_DIR)/pppm_cl.h \
		@@ -102,7 +103,8 @@ KERS = $(OBJ_DIR)/device_cl.h $(OBJ_DIR)/atom_cl.h \
		$(OBJ_DIR)/lj_cubic_cl.h $(OBJ_DIR)/vashishta_cl.h \
		$(OBJ_DIR)/ufm_cl.h $(OBJ_DIR)/dipole_long_lj_cl.h \
		$(OBJ_DIR)/lj_expand_coul_long_cl.h $(OBJ_DIR)/coul_long_cs_cl.h \
		$(OBJ_DIR)/born_coul_long_cs_cl.h $(OBJ_DIR)/born_coul_wolf_cs_cl.h
		$(OBJ_DIR)/born_coul_long_cs_cl.h $(OBJ_DIR)/born_coul_wolf_cs_cl.h \
		$(OBJ_DIR)/lj_tip4p_long_cl.h


		OCL_EXECS = $(BIN_DIR)/ocl_get_devices
		@@ -202,6 +204,15 @@ $(OBJ_DIR)/lal_lj.o: $(ALL_H) lal_lj.h lal_lj.cpp $(OBJ_DIR)/lj_cl.h $(OBJ_DIR)
		$(OBJ_DIR)/lal_lj_ext.o: $(ALL_H) lal_lj.h lal_lj_ext.cpp lal_base_atomic.h
		$(OCL) -o $@ -c lal_lj_ext.cpp -I$(OBJ_DIR)

		$(OBJ_DIR)/lj_tip4p_long_cl.h: lal_lj_tip4p_long.cu $(PRE1_H)
		$(BSH) ./geryon/file_to_cstr.sh lj_tip4p_long $(PRE1_H) lal_lj_tip4p_long.cu $(OBJ_DIR)/lj_tip4p_long_cl.h;

		$(OBJ_DIR)/lal_lj_tip4p_long.o: $(ALL_H) lal_lj_tip4p_long.h lal_lj_tip4p_long.cpp $(OBJ_DIR)/lj_tip4p_long_cl.h $(OBJ_DIR)/lj_tip4p_long_cl.h $(OBJ_DIR)/lal_base_atomic.o
		$(OCL) -o $@ -c lal_lj_tip4p_long.cpp -I$(OBJ_DIR)

		$(OBJ_DIR)/lal_lj_tip4p_long_ext.o: $(ALL_H) lal_lj_tip4p_long.h lal_lj_tip4p_long_ext.cpp lal_base_atomic.h
		$(OCL) -o $@ -c lal_lj_tip4p_long_ext.cpp -I$(OBJ_DIR)

		$(OBJ_DIR)/lj_coul_cl.h: lal_lj_coul.cu $(PRE1_H)
		$(BSH) ./geryon/file_to_cstr.sh lj_coul $(PRE1_H) lal_lj_coul.cu $(OBJ_DIR)/lj_coul_cl.h;

lib/gpu/lal_lj_tip4p_long.cpp

0 → 100644

+241 −0

Original line number	Diff line number	Diff line
		#if defined(USE_OPENCL)
		#include "lj_tip4p_long_cl.h"
		#elif defined(USE_CUDART)
		const char *lj_tip4p=0;
		#else
		#include "lj_tip4p_long_cubin.h"
		#endif

		#include "lal_lj_tip4p_long.h"
		#include <cassert>
		using namespace LAMMPS_AL;
		#define LJ_TIP4PLong_T LJ_TIP4PLong<numtyp, acctyp>

		extern Device<PRECISION,ACC_PRECISION> device;

		template <class numtyp, class acctyp>
		LJ_TIP4PLong<numtyp, acctyp>::LJ_TIP4PLong(): BaseCharge<numtyp,acctyp>(), _allocated(false) {
		}

		template <class numtyp, class acctyp>
		LJ_TIP4PLong<numtyp, acctyp>::~LJ_TIP4PLong() {
		clear();
		}

		template <class numtyp, class acctyp>
		int LJ_TIP4PLong<numtyp, acctyp>::bytes_per_atom(const int max_nbors) const {
		return this->bytes_per_atom_atomic(max_nbors);
		}

		template <class numtyp, class acctyp>
		int LJ_TIP4PLong<numtyp, acctyp>::init(const int ntypes,
		double host_cutsq, double host_lj1,
		double host_lj2, double host_lj3,
		double host_lj4, double host_offset,
		double *host_special_lj, const int nlocal,
		const int tH, const int tO,
		const double a, const double qd,
		const int nall, const int max_nbors,
		const int maxspecial, const double cell_size,
		const double gpu_split, FILE *_screen,
		double **host_cut_ljsq,
		const double host_cut_coulsq, const double host_cut_coulsqplus,
		double *host_special_coul, const double qqrd2e,
		const double g_ewald, int* tag,
		int *map_array, int map_size,
		int *sametag, int max_same) {
		int success;
		success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
		_screen,lj_tip4p_long,"k_lj_tip4p_long");
		if (success!=0)
		return success;
		k_pair_distrib.set_function(*this->pair_program,"k_lj_tip4p_long_distrib");

		TypeH = tH;
		TypeO = tO;
		alpha = a;
		qdist = qd;

		// If atom type constants fit in shared memory use fast kernel
		int lj_types=ntypes;
		shared_types=false;
		// int max_shared_types=this->device->max_shared_types();
		// if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
		// lj_types=max_shared_types;
		// shared_types=true;
		// }
		_lj_types=lj_types;

		// Allocate a host write buffer for data initialization
		UCL_H_Vec<numtyp> host_write(lj_typeslj_types32,*(this->ucl_device),
		UCL_WRITE_ONLY);

		for (int i=0; i<lj_types*lj_types; i++)
		host_write[i]=0.0;

		lj1.alloc(lj_typeslj_types,(this->ucl_device),UCL_READ_ONLY);
		this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
		host_cut_ljsq);

		lj3.alloc(lj_typeslj_types,(this->ucl_device),UCL_READ_ONLY);
		this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
		host_offset);

		cutsq.alloc(lj_typeslj_types,(this->ucl_device),UCL_READ_ONLY);
		this->atom->type_pack1(ntypes,lj_types,cutsq,host_write,host_cutsq);

		sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY);
		for (int i=0; i<4; i++) {
		host_write[i]=host_special_lj[i];
		host_write[i+4]=host_special_coul[i];
		}
		ucl_copy(sp_lj,host_write,8,false);

		force_comp.alloc(7272, (this->ucl_device), UCL_READ_WRITE);

		_qqrd2e=qqrd2e;
		_g_ewald=g_ewald;
		cut_coulsq = host_cut_coulsq;
		cut_coulsqplus = host_cut_coulsqplus;

		hneight.alloc(nall4,(this->ucl_device), UCL_READ_WRITE);
		m.alloc(nall,*(this->ucl_device), UCL_READ_WRITE);
		ansO.alloc(nall,*(this->ucl_device), UCL_READ_WRITE);

		// Allocate a host write buffer for data initialization
		UCL_H_Vec<int> host_tag_write(nall,*(this->ucl_device),UCL_WRITE_ONLY);
		this->tag.alloc(nall,*(this->ucl_device), UCL_READ_WRITE);
		for(int i=0; i<nall; ++i) host_tag_write[i] = tag[i];
		ucl_copy(this->tag, host_tag_write, nall, false);

		//if(max_same>host_tag_write.cols()) host_tag_write.resize(max_same);
		this->atom_sametag.alloc(nall, *(this->ucl_device), UCL_READ_WRITE);
		for(int i=0; i<nall; ++i) host_tag_write[i] = sametag[i];
		ucl_copy(this->atom_sametag, host_tag_write, nall, false);

		if(map_size>host_tag_write.cols()) host_tag_write.resize(map_size);
		this->map_array.alloc(map_size,*(this->ucl_device), UCL_READ_WRITE);
		for(int i=0; i<map_size; ++i) host_tag_write[i] = map_array[i];
		ucl_copy(this->map_array, host_tag_write, map_size, false);

		_allocated=true;
		this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+cutsq.row_bytes()+
		sp_lj.row_bytes() + hneight.row_bytes()+m.row_bytes()+
		this->tag.row_bytes()+this->atom_sametag.row_bytes() +
		this->map_array.row_bytes();
		return 0;
		}


		template <class numtyp, class acctyp>
		void LJ_TIP4PLong<numtyp, acctyp>::clear() {
		if (!_allocated)
		return;
		_allocated=false;

		lj1.clear();
		lj3.clear();
		sp_lj.clear();
		cutsq.clear();
		hneight.clear();
		m.clear();
		tag.clear();
		atom_sametag.clear();
		map_array.clear();
		ansO.clear();
		force_comp.clear();

		k_pair_distrib.clear();

		this->clear_atomic();
		}

		template <class numtyp, class acctyp>
		double LJ_TIP4PLong<numtyp, acctyp>::host_memory_usage() const {
		return this->host_memory_usage_atomic()+sizeof(LJ_TIP4PLong<numtyp,acctyp>);
		}

		// ---------------------------------------------------------------------------
		// Calculate energies, forces, and torques
		// ---------------------------------------------------------------------------
		template <class numtyp, class acctyp>
		void LJ_TIP4PLong<numtyp, acctyp>::loop(const bool _eflag, const bool _vflag) {
		// Compute the block size and grid size to keep all cores busy
		const int BX=this->block_size();
		int eflag, vflag;
		if (_eflag)
		eflag=1;
		else
		eflag=0;

		if (_vflag)
		vflag=1;
		else
		vflag=0;

		int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
		(BX/this->_threads_per_atom)));

		int ainum=this->ans->inum();
		int nbor_pitch=this->nbor->nbor_pitch();
		this->time_pair.start();

		this->k_pair.set_size(GX,BX);
		if (vflag){
		this->ansO.resize(ainum*3);
		} else {
		this->ansO.resize(ainum);
		}
		this->ansO.zero();
		this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj,
		&this->nbor->dev_nbor, &this->_nbor_data->begin(),
		&this->ans->force, &this->ans->engv, &eflag, &vflag,
		&ainum, &nbor_pitch, &this->_threads_per_atom,
		&hneight, &m, &TypeO, &TypeH, &alpha,
		&this->atom->q, &cutsq, &_qqrd2e, &_g_ewald,
		&cut_coulsq, &cut_coulsqplus, &tag, &map_array,
		&atom_sametag, &this->ansO);
		GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/BX));
		this->k_pair_distrib.set_size(GX,BX);
		this->k_pair_distrib.run(&this->atom->x, &this->ans->force, &this->ans->engv, &eflag, &vflag,
		&ainum, &nbor_pitch, &this->_threads_per_atom,
		&hneight, &m, &TypeO, &TypeH, &alpha,
		&this->atom->q, &this->ansO);
		this->time_pair.stop();
		}


		template <class numtyp, class acctyp>
		void LJ_TIP4PLong<numtyp, acctyp>::copy_relations_data(int hn, double newsite, int n,
		int* tag, int map_array, int map_size, int sametag, int max_same, int ago){
		int nall = n;
		const int hn_sz = n4; // matrix size = col size col number
		if(hn_sz > hneight.cols()){
		hneight.resize(hn_sz+1);
		}
		if (ago == 0)
		hneight.zero();
		if(n > m.cols()){
		m.resize(n+1);
		}
		m.zero();

		UCL_H_Vec<int> host_tag_write(nall,*(this->ucl_device),UCL_WRITE_ONLY);
		if(this->tag.cols() < nall) this->tag.resize(nall);
		for(int i=0; i<nall; ++i) host_tag_write[i] = tag[i];
		ucl_copy(this->tag, host_tag_write, nall, false);

		if(max_same>host_tag_write.cols()) host_tag_write.resize(max_same);
		if(this->atom_sametag.cols() < nall) this->atom_sametag.resize(nall);
		for(int i=0; i<nall; ++i) host_tag_write[i] = sametag[i];
		ucl_copy(this->atom_sametag, host_tag_write, nall, false);

		if(map_size>host_tag_write.cols()) host_tag_write.resize(map_size);
		if(this->map_array.cols() < map_size) this->map_array.resize(map_size);
		for(int i=0; i<map_size; ++i) host_tag_write[i] = map_array[i];
		ucl_copy(this->map_array, host_tag_write, map_size, false);

		host_tag_write.clear();
		}

		template class LJ_TIP4PLong<PRECISION,ACC_PRECISION>;

lib/gpu/lal_lj_tip4p_long.cu

0 → 100644

+518 −0

File added.

Preview size limit exceeded, changes collapsed.

lib/gpu/lal_lj_tip4p_long.h

0 → 100644

+95 −0

Original line number	Diff line number	Diff line
		#ifndef LAL_LJ_TIP4P_LONG_H
		#define LAL_LJ_TIP4P_LONG_H

		#include "lal_base_charge.h"

		namespace LAMMPS_AL {

		template <class numtyp, class acctyp>
		class LJ_TIP4PLong : public BaseCharge<numtyp, acctyp> {
		public:
		LJ_TIP4PLong();
		~LJ_TIP4PLong();

		/// Clear any previous data and set up for a new LAMMPS run
		/** \param max_nbors initial number of rows in the neighbor matrix
		* \param cell_size cutoff + skin
		* \param gpu_split fraction of particles handled by device
		*
		* Returns:
		* - 0 if successfull
		* - -1 if fix gpu not found
		* - -3 if there is an out of memory error
		* - -4 if the GPU library was not compiled for GPU
		* - -5 Double precision is not supported on card **/
		int init(const int ntypes, double **host_cutsq,
		double host_lj1, double host_lj2, double **host_lj3,
		double host_lj4, double host_offset, double *host_special_lj,
		const int nlocal, const int tH, const int tO,
		const double alpha, const double qdist,
		const int nall, const int max_nbors,
		const int maxspecial, const double cell_size,
		const double gpu_split, FILE *screen,
		double **host_cut_ljsq,
		const double host_cut_coulsq, const double host_cut_coulsqplus,
		double *host_special_coul, const double qqrd2e,
		const double g_ewald, int* tag,
		int *map_array, int map_size,
		int *sametag, int max_same);

		/// Clear all host and device data
		/ \note This is called at the beginning of the init() routine /
		void clear();

		/// Returns memory usage on device per atom
		int bytes_per_atom(const int max_nbors) const;

		/// Total host memory used by library for pair style
		double host_memory_usage() const;

		/// Copy data which is easier to compute in LAMMPS_NS
		void copy_relations_data(int hn, double m, int n,int* tag, int *map_array, int map_size,
		int *sametag, int max_same, int ago);

		// --------------------------- TYPE DATA --------------------------

		/// lj1.x = lj1, lj1.y = lj2, lj1.z = cutsq_vdw
		UCL_D_Vec<numtyp4> lj1;
		/// lj3.x = lj3, lj3.y = lj4, lj3.z = offset
		UCL_D_Vec<numtyp4> lj3;
		/// cutsq
		UCL_D_Vec<numtyp> cutsq;
		/// Special LJ values [0-3] and Special Coul values [4-7]
		UCL_D_Vec<numtyp> sp_lj;

		/// If atom type constants fit in shared memory, use fast kernels
		bool shared_types;

		/// Number of atom types
		int _lj_types;

		numtyp _qqrd2e, _g_ewald;
		/// TIP4P water parameters
		int TypeO, TypeH;
		numtyp alpha, qdist;
		numtyp cut_coulsq, cut_coulsqplus;

		UCL_D_Vec<int> hneight;
		UCL_D_Vec<numtyp4> m; // position and charge of virtual particle
		UCL_D_Vec<acctyp4> ansO; // force applied to virtual particle
		UCL_D_Vec<acctyp4> force_comp;

		UCL_D_Vec<int> tag;
		UCL_D_Vec<int> map_array;
		UCL_D_Vec<int> atom_sametag;

		UCL_Kernel k_pair_distrib;

		private:
		bool _allocated;
		void loop(const bool _eflag, const bool _vflag);
		};

		}

		#endif

Admin message