Merge 'gpu_hip_port' into master (66c5fa2a) · Commits · 郑智淋 / lammps

lib/gpu/Makefile.hip

0 → 100644

+148 −0

Original line number	Diff line number	Diff line
		# /* ----------------------------------------------------------------------
		# Generic Linux Makefile for HIP
		# - HIP_PLATFORM (hcc or nvcc) is detected with 'hipconfig --compiler'; override it on the make command line if needed
		# - change HIP_ARCH for your GPU
		# ------------------------------------------------------------------------- */

		# this setting should match LAMMPS Makefile
		# one of LAMMPS_SMALLBIG (default), LAMMPS_BIGBIG and LAMMPS_SMALLSMALL

		LMP_INC = -DLAMMPS_SMALLBIG

		# precision for GPU calculations
		# -D_SINGLE_SINGLE # Single precision for all calculations
		# -D_DOUBLE_DOUBLE # Double precision for all calculations
		# -D_SINGLE_DOUBLE # Accumulation of forces, etc. in double

		HIP_PRECISION = -D_SINGLE_DOUBLE

		# base optimization level; platform-specific fast-math flags are appended further below
		HIP_OPTS = -O3
		# flags and include paths that end up in the host compile command (HIP_HOST_CC_CMD)
		HIP_HOST_OPTS = -Wno-deprecated-declarations
		HIP_HOST_INCLUDE =

		# use device sort
		# requires linking with hipcc and hipCUB + (rocPRIM or CUB for AMD or Nvidia respectively)
		HIP_HOST_OPTS += -DUSE_HIP_DEVICE_SORT
		# path to cub
		HIP_HOST_INCLUDE += -I./
		# path to hipcub
		# (HIP_PATH is assigned later in this file; that works because HIP_HOST_INCLUDE
		# is a recursively expanded variable and is only expanded when it is used)
		HIP_HOST_INCLUDE += -I$(HIP_PATH)/../include
		# use mpi
		HIP_HOST_OPTS += -DMPI_GERYON -DUCL_NO_EXIT
		# these settings should match the LAMMPS Makefile
		# ':=' runs the mpicxx wrapper once while the makefile is parsed; with '=' it
		# would be re-run on every expansion of these variables (once per compile command)
		MPI_COMP_OPTS := $(shell mpicxx --showme:compile)
		MPI_LINK_OPTS := $(shell mpicxx --showme:link)
		#MPI_COMP_OPTS += -I/usr/include/mpi -DMPICH_IGNORE_CXX_SEEK -DOMPI_SKIP_MPICXX=1

		# Location of the HIP installation; a value supplied by the environment or the
		# command line takes precedence over the default probe of /opt/rocm/hip.
		HIP_PATH ?= $(wildcard /opt/rocm/hip)
		# Ask hipconfig for the compiler back end exactly once (':=') instead of on every
		# expansion. NOTE: because HIP_PLATFORM is assigned here, a value exported in the
		# environment is ignored; to force a platform pass it on the make command line.
		HIP_PLATFORM := $(shell $(HIP_PATH)/bin/hipconfig --compiler)

		ifeq (hcc,$(HIP_PLATFORM))
		HIP_OPTS += -ffast-math
		# possible values: gfx803,gfx900,gfx906
		HIP_ARCH = gfx906
		else ifeq (nvcc,$(HIP_PLATFORM))
		HIP_OPTS += --use_fast_math
		HIP_ARCH = -gencode arch=compute_30,code=[sm_30,compute_30] -gencode arch=compute_32,code=[sm_32,compute_32] -gencode arch=compute_35,code=[sm_35,compute_35] \
		-gencode arch=compute_50,code=[sm_50,compute_50] -gencode arch=compute_52,code=[sm_52,compute_52] -gencode arch=compute_53,code=[sm_53,compute_53]\
		-gencode arch=compute_60,code=[sm_60,compute_60] -gencode arch=compute_61,code=[sm_61,compute_61] -gencode arch=compute_62,code=[sm_62,compute_62]\
		-gencode arch=compute_70,code=[sm_70,compute_70] -gencode arch=compute_72,code=[sm_72,compute_72] -gencode arch=compute_75,code=[sm_75,compute_75]
		else
		# any other hipconfig answer (including an empty one) aborts the build
		$(error Specify HIP platform using 'export HIP_PLATFORM=(hcc,nvcc)')
		endif

		# output locations: test binary, object/generated-header directory, static library
		BIN_DIR = .
		OBJ_DIR = ./obj
		LIB_DIR = .
		# archiver used to create libgpu.a
		AR = ar
		# NOTE(review): BSH is not referenced anywhere in the visible part of this file
		BSH = /bin/sh


		# /* ----------------------------------------------------------------------
		# don't change section below without need
		# ------------------------------------------------------------------------- */

		# options shared by host and device compilation
		HIP_OPTS += -DUSE_HIP $(HIP_PRECISION)
		# options for compiling the GPU kernels
		HIP_GPU_OPTS += $(HIP_OPTS) -I./

		ifeq (hcc,$(HIP_PLATFORM))
		HIP_HOST_OPTS += -fPIC
		# kernel compiler command for the hcc platform
		HIP_GPU_CC = $(HIP_PATH)/bin/hipcc --genco
		# opening and closing parts placed around $(HIP_GPU_OPTS) by the kernel rules,
		# so the kernel options end up inside -f=\" ... \"
		HIP_GPU_OPTS_S = -t="$(HIP_ARCH)" -f=\"
		HIP_GPU_OPTS_E = \"
		# suffix appended to the temporary copy of each kernel source
		HIP_KERNEL_SUFFIX = .cpp
		# text written into Makefile.lammps by the libgpu.a rule
		HIP_LIBS_TARGET = export HCC_AMDGPU_TARGET := $(HIP_ARCH)
		# make the target architecture visible to the commands run by the recipes
		export HCC_AMDGPU_TARGET := $(HIP_ARCH)
		else ifeq (nvcc,$(HIP_PLATFORM))
		# kernel compiler command for the nvcc platform
		HIP_GPU_CC = $(HIP_PATH)/bin/hipcc --fatbin
		HIP_GPU_OPTS += $(HIP_ARCH)
		# extra architecture flags passed only when compiling lal_atom.cpp
		HIP_GPU_SORT_ARCH = $(HIP_ARCH)
		# nvcc cannot handle the -pthread flag directly, so forward it with -Xcompiler
		MPI_COMP_OPTS := $(subst -pthread,-Xcompiler -pthread,$(MPI_COMP_OPTS))
		MPI_LINK_OPTS := $(subst -pthread,-Xcompiler -pthread,$(MPI_LINK_OPTS))
		endif

		# hipcc is essential for device sort because hipcub is a header-only library and ROCm GPU code generation is deferred to the linking stage
		HIP_HOST_CC = $(HIP_PATH)/bin/hipcc
		HIP_HOST_OPTS += $(HIP_OPTS) $(MPI_COMP_OPTS) $(LMP_INC)
		# complete host compile command used by the object and test-app rules
		HIP_HOST_CC_CMD = $(HIP_HOST_CC) $(HIP_HOST_OPTS) $(HIP_HOST_INCLUDE)

		# sources

		# headers every object and kernel header depends on (only those that exist)
		ALL_H = $(wildcard ./geryon/ucl.h ./geryon/hip.h ./lal_*.h)
		# host sources and the objects built from them inside $(OBJ_DIR)
		SRCS := $(wildcard ./lal_*.cpp)
		OBJS := $(patsubst ./%.cpp,$(OBJ_DIR)/%.o,$(SRCS))
		# kernel sources; each lal_X.cu yields X_cubin.h, except lal_pppm.cu which
		# yields the two precision variants pppm_f_cubin.h and pppm_d_cubin.h
		CUS := $(wildcard lal_*.cu)
		CUHS := $(filter-out pppm_cubin.h,$(patsubst lal_%.cu,%_cubin.h,$(CUS))) pppm_f_cubin.h pppm_d_cubin.h
		CUHS := $(addprefix $(OBJ_DIR)/,$(CUHS))

		# Default goal: object directory, embedded kernel headers, the static library
		# and the device-query test program. 'all' names a command, not a file, so it is
		# declared phony to keep a stray file called 'all' from suppressing the build.
		.PHONY: all
		all: $(OBJ_DIR) $(CUHS) $(LIB_DIR)/libgpu.a $(BIN_DIR)/hip_get_devices

		# Create the object directory (and any missing parents) on demand.
		$(OBJ_DIR):
			mkdir -p $@

		# GPU kernels compilation

		# Embedded single-precision PPPM kernel header.
		# lal_pppm.cu is copied to a temporary file, compiled with grdtyp=float and
		# grdtyp4=float4, dumped as a C array with 'xxd -i', and every identifier ending
		# in pppm_f_cubin is renamed to start with pppm_f. The temporaries are then removed.
		# $(OBJ_DIR) is an order-only prerequisite so the directory exists before the
		# recipe writes into it, also under 'make -j'.
		$(OBJ_DIR)/pppm_f_cubin.h: lal_pppm.cu $(ALL_H) | $(OBJ_DIR)
			@cp $< $(OBJ_DIR)/temp_pppm_f.cu$(HIP_KERNEL_SUFFIX)
			$(HIP_GPU_CC) $(HIP_GPU_OPTS_S) $(HIP_GPU_OPTS) -Dgrdtyp=float -Dgrdtyp4=float4 $(HIP_GPU_OPTS_E) -o $(OBJ_DIR)/pppm_f.cubin $(OBJ_DIR)/temp_pppm_f.cu$(HIP_KERNEL_SUFFIX)
			@xxd -i $(OBJ_DIR)/pppm_f.cubin $@
			@sed -i "s/[a-zA-Z0-9_]*pppm_f_cubin/pppm_f/g" $@
			@rm $(OBJ_DIR)/temp_pppm_f.cu$(HIP_KERNEL_SUFFIX) $(OBJ_DIR)/pppm_f.cubin

		# Embedded double-precision PPPM kernel header.
		# Same steps as the single-precision variant, but lal_pppm.cu is compiled with
		# grdtyp=double and grdtyp4=double4 and the array identifiers get the pppm_d prefix.
		# $(OBJ_DIR) is an order-only prerequisite so the directory exists before the
		# recipe writes into it, also under 'make -j'.
		$(OBJ_DIR)/pppm_d_cubin.h: lal_pppm.cu $(ALL_H) | $(OBJ_DIR)
			@cp $< $(OBJ_DIR)/temp_pppm_d.cu$(HIP_KERNEL_SUFFIX)
			$(HIP_GPU_CC) $(HIP_GPU_OPTS_S) $(HIP_GPU_OPTS) -Dgrdtyp=double -Dgrdtyp4=double4 $(HIP_GPU_OPTS_E) -o $(OBJ_DIR)/pppm_d.cubin $(OBJ_DIR)/temp_pppm_d.cu$(HIP_KERNEL_SUFFIX)
			@xxd -i $(OBJ_DIR)/pppm_d.cubin $@
			@sed -i "s/[a-zA-Z0-9_]*pppm_d_cubin/pppm_d/g" $@
			@rm $(OBJ_DIR)/temp_pppm_d.cu$(HIP_KERNEL_SUFFIX) $(OBJ_DIR)/pppm_d.cubin

		# Generic rule: embed the compiled kernel lal_<stem>.cu as $(OBJ_DIR)/<stem>_cubin.h.
		# The stem is referenced with the automatic variable $* throughout; the source is
		# copied to a temporary file, compiled to <stem>.cubin, dumped as a C array with
		# 'xxd -i', and every identifier ending in <stem>_cubin is renamed to start with <stem>.
		# $(OBJ_DIR) is an order-only prerequisite so the directory exists before the
		# recipe writes into it, also under 'make -j'.
		$(OBJ_DIR)/%_cubin.h: lal_%.cu $(ALL_H) | $(OBJ_DIR)
			@cp $< $(OBJ_DIR)/temp_$*.cu$(HIP_KERNEL_SUFFIX)
			$(HIP_GPU_CC) $(HIP_GPU_OPTS_S) $(HIP_GPU_OPTS) $(HIP_GPU_OPTS_E) -o $(OBJ_DIR)/$*.cubin $(OBJ_DIR)/temp_$*.cu$(HIP_KERNEL_SUFFIX)
			@xxd -i $(OBJ_DIR)/$*.cubin $@
			@sed -i "s/[a-zA-Z0-9_]*$*_cubin/$*/g" $@
			@rm $(OBJ_DIR)/temp_$*.cu$(HIP_KERNEL_SUFFIX) $(OBJ_DIR)/$*.cubin

		# host sources compilation

		# lal_atom.cpp is compiled like every other host source, plus the extra
		# architecture flags in HIP_GPU_SORT_ARCH (only set on the nvcc platform).
		$(OBJ_DIR)/lal_atom.o: lal_atom.cpp $(CUHS) $(ALL_H)
			$(HIP_HOST_CC) $(HIP_HOST_OPTS) $(HIP_HOST_INCLUDE) -o $@ -c $< -I$(OBJ_DIR) $(HIP_GPU_SORT_ARCH)

		# Every other host source: needs all generated kernel headers, found via -I$(OBJ_DIR).
		$(OBJ_DIR)/lal_%.o: lal_%.cpp $(CUHS) $(ALL_H)
			$(HIP_HOST_CC) $(HIP_HOST_OPTS) $(HIP_HOST_INCLUDE) -o $@ -c $< -I$(OBJ_DIR)

		# libgpu building

		# Archive all host objects into the static library, then record the platform
		# (and, on hcc, the target architecture line held in HIP_LIBS_TARGET) in Makefile.lammps.
		# printf is used instead of echo "...\n...": whether echo turns "\n" into a
		# newline depends on the shell, whereas printf always does.
		$(LIB_DIR)/libgpu.a: $(OBJS)
			$(AR) -crs $@ $(OBJS)
			printf 'export HIP_PLATFORM := %s\n%s\n' "$(HIP_PLATFORM)" "$(HIP_LIBS_TARGET)" > 'Makefile.lammps'

		# test app building

		# Device-query test program: compiled and linked in one step from
		# geryon/ucl_get_devices.cpp with UCL_HIP defined and the MPI link options appended.
		$(BIN_DIR)/hip_get_devices: ./geryon/ucl_get_devices.cpp $(ALL_H)
			$(HIP_HOST_CC) $(HIP_HOST_OPTS) $(HIP_HOST_INCLUDE) -o $@ $< -DUCL_HIP $(MPI_LINK_OPTS)

		# Remove the test program, the library, the objects, leftover temporary kernel
		# copies and the generated kernel headers. Makefile.lammps and $(OBJ_DIR) itself
		# are not in the list and are left in place.
		# 'clean' is a command name, so it is declared phony: a file called 'clean'
		# would otherwise make this target look up to date.
		.PHONY: clean
		clean:
			-rm -f $(BIN_DIR)/hip_get_devices $(LIB_DIR)/libgpu.a $(OBJS) $(OBJ_DIR)/temp_* $(CUHS)

lib/gpu/geryon/hip_device.h

0 → 100644

+519 −0

File added.

Preview size limit exceeded, changes collapsed.

lib/gpu/geryon/hip_kernel.h

0 → 100644

+298 −0

Original line number	Diff line number	Diff line
		/* -----------------------------------------------------------------------
		Copyright (2010) Sandia Corporation. Under the terms of Contract
		DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
		certain rights in this software. This software is distributed under
		the Simplified BSD License.
		----------------------------------------------------------------------- */

		#ifndef HIP_KERNEL
		#define HIP_KERNEL


		#include <hip/hip_runtime.h>
		#include "hip_device.h"
		#include <fstream>
		#include <string>
		#include <iostream>

		namespace ucl_hip {

		class UCL_Texture;
		template <class numtyp> class UCL_D_Vec;
		template <class numtyp> class UCL_D_Mat;
		template <class hosttype, class devtype> class UCL_Vector;
		template <class hosttype, class devtype> class UCL_Matrix;
		#define UCL_MAX_KERNEL_ARGS 256

		/// Class storing 1 or more kernel functions from a single string or file
		/** Holds a pointer to the owning device, the loaded hipModule_t and the
		  * stream taken from the device at construction/init time. **/
		class UCL_Program {
		  UCL_Device* _device_ptr;
		 public:
		  /// Bind the program to a device without loading any code
		  inline UCL_Program(UCL_Device &device) { _device_ptr = &device; _cq=device.cq(); }

		  /// Bind the program to a device and load the module image in \e program
		  /** \param program  module image passed on to the device's load_module()
		    * \param flags    accepted for interface compatibility; not used here
		    * \param log      optional string passed on to load_module() **/
		  inline UCL_Program(UCL_Device &device, const void *program,
		                     const char *flags="", std::string *log=NULL) {
		    _device_ptr = &device; _cq=device.cq();
		    init(device);
		    load_string(program,flags,log);
		  }

		  inline ~UCL_Program() {}

		  /// Initialize the program with a device
		  inline void init(UCL_Device &device) { _device_ptr = &device; _cq=device.cq(); }

		  /// Clear any data associated with program
		  /** \note Must call init() after each clear **/
		  inline void clear() { }

		  /// Load a program from a file and compile with flags
		  /** The whole file is read into memory and handed to load_string().
		    * \return UCL_FILE_NOT_FOUND if the file cannot be opened (after calling
		    *         UCL_GERYON_EXIT unless UCL_NO_EXIT is defined), otherwise the
		    *         result of load_string() **/
		  inline int load(const char *filename, const char *flags="", std::string *log=NULL) {
		    std::ifstream in(filename);
		    if (!in || in.is_open()==false) {
		      #ifndef UCL_NO_EXIT
		      std::cerr << "UCL Error: Could not open kernel file: "
		                << filename << std::endl;
		      UCL_GERYON_EXIT;
		      #endif
		      return UCL_FILE_NOT_FOUND;
		    }

		    std::string program((std::istreambuf_iterator<char>(in)),
		                        std::istreambuf_iterator<char>());
		    in.close();
		    return load_string(program.c_str(),flags,log);
		  }

		  /// Load a program from a string and compile with flags
		  /** \note \e flags is not forwarded; only \e program and \e log reach
		    *       the device's load_module()
		    * \return the value returned by load_module() **/
		  inline int load_string(const void *program, const char *flags="", std::string *log=NULL) {
		    return _device_ptr->load_module(program, _module, log);
		  }

		  friend class UCL_Kernel;
		 private:
		  hipModule_t _module;
		  hipStream_t _cq;
		  friend class UCL_Texture;
		};

		/// Class for dealing with HIP module kernels
		/** Arguments are packed, each aligned to its own type, into a byte buffer
		  * that is handed to hipModuleLaunchKernel() through the launch "extra"
		  * configuration in run(). **/
		class UCL_Kernel {
		 public:
		  UCL_Kernel() : _dimensions(1), _num_args(0) {
		    _num_blocks[0]=0;
		  }

		  /// Construct and immediately look up \e function in \e program
		  UCL_Kernel(UCL_Program &program, const char *function) :
		    _dimensions(1), _num_args(0) {
		    _num_blocks[0]=0;
		    set_function(program,function);
		    _cq=program._cq;
		  }

		  ~UCL_Kernel() {}

		  /// Clear any function associated with the kernel
		  inline void clear() { }

		  /// Get the kernel function from a program
		  /** \ret UCL_ERROR_FLAG (UCL_SUCCESS, UCL_FILE_NOT_FOUND, UCL_ERROR) **/
		  inline int set_function(UCL_Program &program, const char *function) {
		    hipError_t err=hipModuleGetFunction(&_kernel,program._module,function);
		    if (err!=hipSuccess) {
		      #ifndef UCL_NO_EXIT
		      std::cerr << "UCL Error: Could not find function: " << function
		                << " in program.\n";
		      UCL_GERYON_EXIT;
		      #endif
		      return UCL_FUNCTION_NOT_FOUND;
		    }
		    // the kernel inherits the stream of the program it was taken from
		    _cq=program._cq;
		    return UCL_SUCCESS;
		  }

		  /// Set the kernel argument.
		  /** If not a device pointer, this must be repeated each time the argument
		    * changes
		    * \note To set kernel parameter i (i>0), parameter i-1 must be set **/
		  template <class dtype>
		  inline void set_arg(const unsigned index, const dtype * const arg) {
		    if (index==_num_args)
		      add_arg(arg);
		    else if (index<_num_args){
		      assert(0==1); // not implemented
		    }
		    else
		      assert(0==1); // Must add kernel parameters in sequential order
		  }

		  /// Set a geryon container as a kernel argument.
		  template <class numtyp>
		  inline void set_arg(const UCL_D_Vec<numtyp> * const arg)
		    { set_arg(&arg->begin()); }

		  /// Set a geryon container as a kernel argument.
		  template <class numtyp>
		  inline void set_arg(const UCL_D_Mat<numtyp> * const arg)
		    { set_arg(&arg->begin()); }

		  /// Set a geryon container as a kernel argument.
		  template <class hosttype, class devtype>
		  inline void set_arg(const UCL_Vector<hosttype, devtype> * const arg)
		    { set_arg(&arg->device.begin()); }

		  /// Set a geryon container as a kernel argument.
		  template <class hosttype, class devtype>
		  inline void set_arg(const UCL_Matrix<hosttype, devtype> * const arg)
		    { set_arg(&arg->device.begin()); }

		  /// Add a kernel argument.
		  /** Device pointers are stored as plain void* values in the argument buffer. **/
		  inline void add_arg(const hipDeviceptr_t* const arg) {
		    add_arg<void*>((void**)arg);
		  }

		  /// Add a kernel argument.
		  /** Appends a copy of \e *arg to the argument buffer, first padding the
		    * buffer so the value starts at a multiple of alignof(dtype). **/
		  template <class dtype>
		  inline void add_arg(const dtype* const arg) {
		    const auto old_size = _hip_kernel_args.size();
		    // round the current size up to the alignment of dtype
		    const auto aligned_size = (old_size+alignof(dtype)-1) & ~(alignof(dtype)-1);
		    const auto arg_size = sizeof(dtype);
		    _hip_kernel_args.resize(aligned_size + arg_size);
		    // store the argument value itself (not its address) in the buffer
		    *((dtype*)(&_hip_kernel_args[aligned_size])) = *arg;
		    _num_args++;
		    if (_num_args>UCL_MAX_KERNEL_ARGS) assert(0==1);
		  }

		  /// Add a geryon container as a kernel argument.
		  template <class numtyp>
		  inline void add_arg(const UCL_D_Vec<numtyp> * const arg)
		    { add_arg(&arg->begin()); }

		  /// Add a geryon container as a kernel argument.
		  template <class numtyp>
		  inline void add_arg(const UCL_D_Mat<numtyp> * const arg)
		    { add_arg(&arg->begin()); }

		  /// Add a geryon container as a kernel argument.
		  template <class hosttype, class devtype>
		  inline void add_arg(const UCL_Vector<hosttype, devtype> * const arg)
		    { add_arg(&arg->device.begin()); }

		  /// Add a geryon container as a kernel argument.
		  template <class hosttype, class devtype>
		  inline void add_arg(const UCL_Matrix<hosttype, devtype> * const arg)
		    { add_arg(&arg->device.begin()); }

		  /// Set the number of thread blocks and the number of threads in each block
		  /** \note This should be called before any arguments have been added
		      \note The default command queue is used for the kernel execution **/
		  inline void set_size(const size_t num_blocks, const size_t block_size) {
		    _dimensions=1;
		    _num_blocks[0]=num_blocks;
		    _num_blocks[1]=1;
		    _num_blocks[2]=1;

		    _block_size[0]=block_size;
		    _block_size[1]=1;
		    _block_size[2]=1;
		  }

		  /// Set the number of thread blocks and the number of threads in each block
		  /** \note This should be called before any arguments have been added
		      \note The default command queue for the kernel is changed to cq **/
		  inline void set_size(const size_t num_blocks, const size_t block_size,
		                       command_queue &cq)
		    { _cq=cq; set_size(num_blocks,block_size); }

		  /// Set the number of thread blocks and the number of threads in each block
		  /** \note This should be called before any arguments have been added
		      \note The default command queue is used for the kernel execution **/
		  inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
		                       const size_t block_size_x, const size_t block_size_y) {
		    _dimensions=2;
		    _num_blocks[0]=num_blocks_x;
		    _num_blocks[1]=num_blocks_y;
		    _num_blocks[2]=1;

		    _block_size[0]=block_size_x;
		    _block_size[1]=block_size_y;
		    _block_size[2]=1;
		  }

		  /// Set the number of thread blocks and the number of threads in each block
		  /** \note This should be called before any arguments have been added
		      \note The default command queue for the kernel is changed to cq **/
		  inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
		                       const size_t block_size_x, const size_t block_size_y,
		                       command_queue &cq)
		    {_cq=cq; set_size(num_blocks_x, num_blocks_y, block_size_x, block_size_y);}

		  /// Set the number of thread blocks and the number of threads in each block
		  /** The grid is two-dimensional (z extent fixed to 1); the block is
		    * three-dimensional.
		      \note This should be called before any arguments have been added
		      \note The default command queue is used for the kernel execution **/
		  inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
		                       const size_t block_size_x,
		                       const size_t block_size_y, const size_t block_size_z) {
		    // NOTE(review): _dimensions stays 2 here although the block is 3-D;
		    // _dimensions is not read anywhere in this header - confirm intent
		    _dimensions=2;
		    _num_blocks[0]=num_blocks_x;
		    _num_blocks[1]=num_blocks_y;
		    _num_blocks[2]=1;

		    _block_size[0]=block_size_x;
		    _block_size[1]=block_size_y;
		    _block_size[2]=block_size_z;
		  }

		  /// Set the number of thread blocks and the number of threads in each block
		  /** \note This should be called before any arguments have been added
		      \note The default command queue for the kernel is changed to cq **/
		  inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
		                       const size_t block_size_x, const size_t block_size_y,
		                       const size_t block_size_z, command_queue &cq) {
		    _cq=cq;
		    set_size(num_blocks_x, num_blocks_y, block_size_x, block_size_y,
		             block_size_z);
		  }

		  /// Run the kernel in the default command queue
		  /** The packed argument buffer and its size are passed through the
		    * "extra" launch configuration; no dynamic shared memory is requested. **/
		  inline void run() {
		    size_t args_size = _hip_kernel_args.size();
		    void *config[] = {
		      HIP_LAUNCH_PARAM_BUFFER_POINTER, (void*)_hip_kernel_args.data(),
		      HIP_LAUNCH_PARAM_BUFFER_SIZE, &args_size,
		      HIP_LAUNCH_PARAM_END
		    };
		    const auto res = hipModuleLaunchKernel(_kernel,_num_blocks[0],_num_blocks[1],
		                                           _num_blocks[2],_block_size[0],_block_size[1],
		                                           _block_size[2],0,_cq, NULL, config);
		    CU_SAFE_CALL(res);
		  }

		  /// Clear any arguments associated with the kernel
		  inline void clear_args() {
		    _num_args=0;
		    _hip_kernel_args.clear();
		  }

		  /// Return the default command queue/stream associated with this data
		  inline command_queue & cq() { return _cq; }
		  /// Change the default command queue associated with matrix
		  inline void cq(command_queue &cq_in) { _cq=cq_in; }
		  #include "ucl_arg_kludge.h"

		 private:
		  hipFunction_t _kernel;
		  hipStream_t _cq;
		  unsigned _dimensions;
		  unsigned _num_blocks[3];
		  unsigned _num_args;
		  friend class UCL_Texture;

		  unsigned _block_size[3];
		  // packed kernel arguments, filled by add_arg() and consumed by run()
		  std::vector<char> _hip_kernel_args;
		};

		} // namespace

		#endif

lib/gpu/geryon/hip_macros.h

0 → 100644

+83 −0

Original line number	Diff line number	Diff line
		#ifndef HIP_MACROS_H
		#define HIP_MACROS_H

		#include <cstdio>
		#include <cassert>
		#include <hip/hip_runtime.h>

		//#if CUDA_VERSION >= 3020
		#define CUDA_INT_TYPE size_t
		//#else
		//#define CUDA_INT_TYPE unsigned
		//#endif

		// NVD_GERYON_EXIT: abort action used by the error-checking macros below.
		// With MPI_GERYON it calls MPI_Abort on MPI_COMM_WORLD unless MPI has already
		// been finalized; otherwise it is a failing assert.
		#ifdef MPI_GERYON
		#include "mpi.h"
		#define NVD_GERYON_EXIT do { \
		int is_final; \
		MPI_Finalized(&is_final); \
		if (!is_final) \
		MPI_Abort(MPI_COMM_WORLD,-1); \
		} while(0)
		#else
		#define NVD_GERYON_EXIT assert(0==1)
		#endif

		// UCL_GERYON_EXIT defaults to NVD_GERYON_EXIT unless it was defined earlier
		#ifndef UCL_GERYON_EXIT
		#define UCL_GERYON_EXIT NVD_GERYON_EXIT
		#endif

		// UCL_DEBUG switches on both synchronizing checks and destructor checks
		#ifdef UCL_DEBUG
		#define UCL_SYNC_DEBUG
		#define UCL_DESTRUCT_CHECK
		#endif

		#ifndef UCL_NO_API_CHECK

		// CU_SAFE_CALL_NS: evaluate the call, and on any result other than hipSuccess
		// print the error code with file/line and run NVD_GERYON_EXIT (no synchronization)
		#define CU_SAFE_CALL_NS( call ) do { \
		hipError_t err = call; \
		if( hipSuccess != err) { \
		fprintf(stderr, "HIP runtime error %d in call at file '%s' in line %i.\n", \
		err, __FILE__, __LINE__ ); \
		NVD_GERYON_EXIT; \
		} } while (0)

		#ifdef UCL_SYNC_DEBUG

		// CU_SAFE_CALL (sync debug): as CU_SAFE_CALL_NS, followed by a checked
		// hipCtxSynchronize()
		#define CU_SAFE_CALL( call ) do { \
		CU_SAFE_CALL_NS( call ); \
		hipError_t err=hipCtxSynchronize(); \
		if( hipSuccess != err) { \
		fprintf(stderr, "HIP runtime error %d in file '%s' in line %i.\n", \
		err, __FILE__, __LINE__ ); \
		NVD_GERYON_EXIT; \
		} } while (0)

		#else

		#define CU_SAFE_CALL( call ) CU_SAFE_CALL_NS( call )

		#endif

		#else // UCL_NO_API_CHECK defined

		// pass-through macros: the call is still evaluated but its result is not checked
		#define CU_SAFE_CALL_NS( call ) call
		#define CU_SAFE_CALL( call) call

		#endif

		// Calls made through CU_DESTRUCT_CALL* are checked only when UCL_DESTRUCT_CHECK is defined
		#ifdef UCL_DESTRUCT_CHECK

		#define CU_DESTRUCT_CALL( call) CU_SAFE_CALL( call)
		#define CU_DESTRUCT_CALL_NS( call) CU_SAFE_CALL_NS( call)

		#else

		#define CU_DESTRUCT_CALL( call) call
		#define CU_DESTRUCT_CALL_NS( call) call

		#endif

		#endif

lib/gpu/geryon/hip_mat.h

0 → 100644

+43 −0

Original line number	Diff line number	Diff line
		/* -----------------------------------------------------------------------
		Copyright (2010) Sandia Corporation. Under the terms of Contract
		DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
		certain rights in this software. This software is distributed under
		the Simplified BSD License.
		----------------------------------------------------------------------- */

		/*! \file */

		#ifndef HIP_MAT_H
		#define HIP_MAT_H


		#include <hip/hip_runtime.h>
		#include "hip_memory.h"

		/// Namespace for HIP routines
		namespace ucl_hip {

		// The generic ucl_*.h headers are included inside this namespace. Each group is
		// bracketed by a #define/#undef pair, so the macro exists only while those
		// headers are being included (presumably the headers check for it - confirm).
		#define _UCL_MAT_ALLOW
		#define _UCL_DEVICE_PTR_MAT
		#include "ucl_basemat.h"
		#include "ucl_h_vec.h"
		#include "ucl_h_mat.h"
		#include "ucl_d_vec.h"
		#include "ucl_d_mat.h"
		#include "ucl_s_obj_help.h"
		#include "ucl_vector.h"
		#include "ucl_matrix.h"
		#undef _UCL_DEVICE_PTR_MAT
		#undef _UCL_MAT_ALLOW

		#define UCL_COPY_ALLOW
		#include "ucl_copy.h"
		#undef UCL_COPY_ALLOW

		#define UCL_PRINT_ALLOW
		#include "ucl_print.h"
		#undef UCL_PRINT_ALLOW

		} // namespace ucl_hip

		#endif

Admin message