Commit 66c5fa2a authored by Vsevak's avatar Vsevak
Browse files

Merge 'gpu_hip_port' into master

parent 5eef3b18
Loading
Loading
Loading
Loading

lib/gpu/Makefile.hip

0 → 100644
+148 −0
Original line number Diff line number Diff line
# /* ----------------------------------------------------------------------   
#  Generic Linux Makefile for HIP
#     - export HIP_PLATFORM=hcc (or nvcc) before execution
#     - change HIP_ARCH for your GPU
# ------------------------------------------------------------------------- */

# LAMMPS integer-size setting; it must match the one used in the LAMMPS Makefile.
# One of LAMMPS_SMALLBIG (default), LAMMPS_BIGBIG and LAMMPS_SMALLSMALL.
# LMP_INC is appended to the host compiler options further below.

LMP_INC = -DLAMMPS_SMALLBIG

# precision for GPU calculations (pick exactly one)
# -D_SINGLE_SINGLE  # Single precision for all calculations
# -D_DOUBLE_DOUBLE  # Double precision for all calculations
# -D_SINGLE_DOUBLE  # Accumulation of forces, etc. in double

HIP_PRECISION = -D_SINGLE_DOUBLE

# HIP_OPTS is shared by the GPU kernel and the host compilations;
# HIP_HOST_OPTS / HIP_HOST_INCLUDE are used for host compilation only.
HIP_OPTS = -O3 
HIP_HOST_OPTS = -Wno-deprecated-declarations
HIP_HOST_INCLUDE = 

# use device sort
# requires linking with hipcc and hipCUB + (rocPRIM or CUB for AMD or Nvidia respectively)
HIP_HOST_OPTS += -DUSE_HIP_DEVICE_SORT 
# path to cub
HIP_HOST_INCLUDE += -I./
# path to hipcub
# HIP_PATH is assigned further down; that is fine because HIP_HOST_INCLUDE is a
# recursively expanded variable and is only expanded when it is used.
HIP_HOST_INCLUDE += -I$(HIP_PATH)/../include

# use mpi
HIP_HOST_OPTS += -DMPI_GERYON -DUCL_NO_EXIT
# These settings should match the LAMMPS Makefile.
# ':=' runs each mpicxx query exactly once while the makefile is parsed; with a
# recursive '=' the command would be re-run on every expansion, i.e. once per
# host compilation.
MPI_COMP_OPTS := $(shell mpicxx --showme:compile)
MPI_LINK_OPTS := $(shell mpicxx --showme:link)
#MPI_COMP_OPTS += -I/usr/include/mpi  -DMPICH_IGNORE_CXX_SEEK -DOMPI_SKIP_MPICXX=1

# HIP installation prefix; may be overridden from the environment or command line.
HIP_PATH ?= $(wildcard /opt/rocm/hip)
# Platform name reported by hipconfig (compared against hcc / nvcc below).
# Evaluated once (':=') instead of on every reference.
HIP_PLATFORM := $(shell $(HIP_PATH)/bin/hipconfig --compiler)

# Platform-specific fast-math flag and GPU target selection.
#   hcc  : HIP_ARCH is a single AMD gfx target name.
#   nvcc : HIP_ARCH is a list of -gencode options, one per compute capability.
# Any other HIP_PLATFORM value aborts parsing with an error.
# NOTE(review): the tab-indented lines below are parsed as ordinary makefile
# statements only because no rule has been started yet at this point.
ifeq (hcc,$(HIP_PLATFORM))
	HIP_OPTS  += -ffast-math
	# possible values: gfx803,gfx900,gfx906
	HIP_ARCH = gfx906
else ifeq (nvcc,$(HIP_PLATFORM))
	HIP_OPTS  += --use_fast_math
	HIP_ARCH = -gencode arch=compute_30,code=[sm_30,compute_30] -gencode arch=compute_32,code=[sm_32,compute_32] -gencode arch=compute_35,code=[sm_35,compute_35] \
		    -gencode arch=compute_50,code=[sm_50,compute_50] -gencode arch=compute_52,code=[sm_52,compute_52] -gencode arch=compute_53,code=[sm_53,compute_53]\
			-gencode arch=compute_60,code=[sm_60,compute_60] -gencode arch=compute_61,code=[sm_61,compute_61] -gencode arch=compute_62,code=[sm_62,compute_62]\
			-gencode arch=compute_70,code=[sm_70,compute_70] -gencode arch=compute_72,code=[sm_72,compute_72] -gencode arch=compute_75,code=[sm_75,compute_75] 
else
	$(error Specify HIP platform using 'export HIP_PLATFORM=(hcc,nvcc)')
endif

# Output locations and tools:
#   BIN_DIR - where the hip_get_devices test program is written
#   OBJ_DIR - object files, generated *_cubin.h headers and temporary kernel files
#   LIB_DIR - where libgpu.a is written
#   AR      - archiver used to create libgpu.a
# NOTE(review): BSH is not referenced anywhere in the visible part of this
# makefile - confirm whether it is still needed.
BIN_DIR = .
OBJ_DIR = ./obj
LIB_DIR = .
AR = ar
BSH = /bin/sh


# /* ----------------------------------------------------------------------   
#  				don't change section below without need			
# ------------------------------------------------------------------------- */

# Options common to kernel and host compilation, then the kernel-only options.
HIP_OPTS += -DUSE_HIP $(HIP_PRECISION)
HIP_GPU_OPTS += $(HIP_OPTS) -I./

# Kernel compiler command per platform.
#   hcc  : 'hipcc --genco'; the kernel options are passed between
#          HIP_GPU_OPTS_S (which opens an escaped quote after -f=) and
#          HIP_GPU_OPTS_E (which closes it), and the copied kernel source gets
#          the extra suffix HIP_KERNEL_SUFFIX. HIP_LIBS_TARGET holds the line
#          that the libgpu.a rule writes into Makefile.lammps.
#   nvcc : 'hipcc --fatbin'; the -gencode list is appended to the kernel
#          options and also stored in HIP_GPU_SORT_ARCH for lal_atom.o.
ifeq (hcc,$(HIP_PLATFORM))
	HIP_HOST_OPTS += -fPIC
	HIP_GPU_CC  = $(HIP_PATH)/bin/hipcc --genco
	HIP_GPU_OPTS_S = -t="$(HIP_ARCH)" -f=\" 
	HIP_GPU_OPTS_E = \"
	HIP_KERNEL_SUFFIX = .cpp
	HIP_LIBS_TARGET = export HCC_AMDGPU_TARGET := $(HIP_ARCH)
	export HCC_AMDGPU_TARGET := $(HIP_ARCH)
else ifeq (nvcc,$(HIP_PLATFORM))
	HIP_GPU_CC  = $(HIP_PATH)/bin/hipcc --fatbin 
	HIP_GPU_OPTS += $(HIP_ARCH)
	HIP_GPU_SORT_ARCH = $(HIP_ARCH)
	# nvcc cannot handle the -pthread flag directly; pass it through -Xcompiler
	MPI_COMP_OPTS := $(subst -pthread,-Xcompiler -pthread,$(MPI_COMP_OPTS))
	MPI_LINK_OPTS := $(subst -pthread,-Xcompiler -pthread,$(MPI_LINK_OPTS))
endif

# hipcc is required for device sort because hipCUB is a header-only library and
# ROCm GPU code generation is deferred to the linking stage
HIP_HOST_CC = $(HIP_PATH)/bin/hipcc
HIP_HOST_OPTS += $(HIP_OPTS) $(MPI_COMP_OPTS) $(LMP_INC)
# Full host compile command: compiler, options, include paths.
HIP_HOST_CC_CMD  = $(HIP_HOST_CC) $(HIP_HOST_OPTS) $(HIP_HOST_INCLUDE)

# sources
#
# All lists are expanded once with ':=' so the globs are not re-evaluated on
# every reference.
#   ALL_H - every header a host or kernel source may include
#   SRCS  - host sources; OBJS are the matching objects inside $(OBJ_DIR)
#   CUS   - kernel sources; CUHS are the generated <name>_cubin.h headers
#           inside $(OBJ_DIR). lal_pppm.cu yields pppm_f_cubin.h and
#           pppm_d_cubin.h instead of pppm_cubin.h.

ALL_H := $(wildcard ./geryon/ucl*.h) $(wildcard ./geryon/hip*.h) $(wildcard ./lal_*.h)
SRCS := $(wildcard ./lal_*.cpp)
OBJS := $(subst ./,$(OBJ_DIR)/,$(SRCS:%.cpp=%.o))
CUS  := $(wildcard lal_*.cu)
CUHS := $(filter-out pppm_cubin.h, $(CUS:lal_%.cu=%_cubin.h)) pppm_f_cubin.h pppm_d_cubin.h
CUHS := $(addprefix $(OBJ_DIR)/, $(CUHS))

# Default goal: object directory, generated kernel headers, the static
# library and the device query tool. Declared phony so that a stray file
# named 'all' cannot suppress the build.
.PHONY: all
all: $(OBJ_DIR) $(CUHS) $(LIB_DIR)/libgpu.a $(BIN_DIR)/hip_get_devices

# Create the directory for objects and generated headers.
$(OBJ_DIR):
	mkdir -p $@

# GPU kernels compilation
#
# Each rule copies the kernel source into $(OBJ_DIR) (adding
# $(HIP_KERNEL_SUFFIX)), compiles it with $(HIP_GPU_CC) into a device binary,
# converts that binary into a C array header with 'xxd -i', and uses sed to
# strip the path-derived prefix and the '_cubin' suffix from the generated
# identifiers. The temporary source and binary are removed afterwards.
#
# $(OBJ_DIR) is an order-only prerequisite: the directory must exist before
# the recipe writes into it (also under 'make -j' or when a single header is
# requested directly), but its timestamp must not trigger rebuilds.

# lal_pppm.cu is compiled twice: single precision grid ...
$(OBJ_DIR)/pppm_f_cubin.h: lal_pppm.cu  $(ALL_H) | $(OBJ_DIR)
	@cp $< $(OBJ_DIR)/temp_pppm_f.cu$(HIP_KERNEL_SUFFIX)
	$(HIP_GPU_CC) $(HIP_GPU_OPTS_S) $(HIP_GPU_OPTS) -Dgrdtyp=float  -Dgrdtyp4=float4 $(HIP_GPU_OPTS_E)  -o $(OBJ_DIR)/pppm_f.cubin $(OBJ_DIR)/temp_pppm_f.cu$(HIP_KERNEL_SUFFIX)
	@xxd -i $(OBJ_DIR)/pppm_f.cubin $@
	@sed -i "s/[a-zA-Z0-9_]*pppm_f_cubin/pppm_f/g" $@
	@rm $(OBJ_DIR)/temp_pppm_f.cu$(HIP_KERNEL_SUFFIX) $(OBJ_DIR)/pppm_f.cubin

# ... and double precision grid.
$(OBJ_DIR)/pppm_d_cubin.h: lal_pppm.cu  $(ALL_H) | $(OBJ_DIR)
	@cp $< $(OBJ_DIR)/temp_pppm_d.cu$(HIP_KERNEL_SUFFIX)
	$(HIP_GPU_CC) $(HIP_GPU_OPTS_S) $(HIP_GPU_OPTS) -Dgrdtyp=double -Dgrdtyp4=double4 $(HIP_GPU_OPTS_E)  -o $(OBJ_DIR)/pppm_d.cubin $(OBJ_DIR)/temp_pppm_d.cu$(HIP_KERNEL_SUFFIX)
	@xxd -i $(OBJ_DIR)/pppm_d.cubin $@
	@sed -i "s/[a-zA-Z0-9_]*pppm_d_cubin/pppm_d/g" $@
	@rm $(OBJ_DIR)/temp_pppm_d.cu$(HIP_KERNEL_SUFFIX) $(OBJ_DIR)/pppm_d.cubin

# Generic rule for every other lal_<name>.cu ($* is <name>).
$(OBJ_DIR)/%_cubin.h: lal_%.cu  $(ALL_H) | $(OBJ_DIR)
	@cp $< $(OBJ_DIR)/temp_$*.cu$(HIP_KERNEL_SUFFIX)
	$(HIP_GPU_CC) $(HIP_GPU_OPTS_S) $(HIP_GPU_OPTS) $(HIP_GPU_OPTS_E)  -o $(OBJ_DIR)/$*.cubin $(OBJ_DIR)/temp_$*.cu$(HIP_KERNEL_SUFFIX)
	@xxd -i $(OBJ_DIR)/$*.cubin $@
	@sed -i "s/[a-zA-Z0-9_]*$*_cubin/$*/g" $@
	@rm $(OBJ_DIR)/temp_$*.cu$(HIP_KERNEL_SUFFIX) $(OBJ_DIR)/$*.cubin

# host sources compilation
#
# Every object depends on all generated kernel headers ($(CUHS)) and on all
# headers ($(ALL_H)), so a change to any of them rebuilds every object.
# -I$(OBJ_DIR) makes the generated *_cubin.h headers visible to the sources.

# lal_atom.o additionally receives $(HIP_GPU_SORT_ARCH), which is only set in
# the nvcc branch above.
$(OBJ_DIR)/lal_atom.o: lal_atom.cpp $(CUHS) $(ALL_H)
	$(HIP_HOST_CC_CMD) -o $@ -c $< -I$(OBJ_DIR) $(HIP_GPU_SORT_ARCH) 

$(OBJ_DIR)/lal_%.o: lal_%.cpp $(CUHS) $(ALL_H)
	$(HIP_HOST_CC_CMD) -o $@ -c $< -I$(OBJ_DIR)

# libgpu building
#
# Archives all host objects and then writes Makefile.lammps with the platform
# settings used for this build (HIP_LIBS_TARGET is empty unless set in the hcc
# branch above). printf is used instead of 'echo "...\n..."' because echo only
# turns \n into a newline in some shells; others would write a literal
# backslash-n and produce a broken one-line Makefile.lammps.

$(LIB_DIR)/libgpu.a: $(OBJS)
	$(AR) -crs $@ $(OBJS)
	printf 'export HIP_PLATFORM := %s\n%s\n' '$(HIP_PLATFORM)' '$(HIP_LIBS_TARGET)' > 'Makefile.lammps'

# test app building
#
# Compiles and links geryon/ucl_get_devices.cpp in a single step with the host
# compile command, -DUCL_HIP and the MPI link options.

$(BIN_DIR)/hip_get_devices: ./geryon/ucl_get_devices.cpp $(ALL_H)
	$(HIP_HOST_CC_CMD) -o $@ $< -DUCL_HIP $(MPI_LINK_OPTS)

# Remove the build products. Declared phony so that a file named 'clean'
# cannot turn this target into a no-op.
.PHONY: clean
clean:
	-rm -f $(BIN_DIR)/hip_get_devices $(LIB_DIR)/libgpu.a $(OBJS) $(OBJ_DIR)/temp_* $(CUHS)
+519 −0

File added.

Preview size limit exceeded, changes collapsed.

+298 −0
Original line number Diff line number Diff line
/* -----------------------------------------------------------------------
   Copyright (2010) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the Simplified BSD License.
   ----------------------------------------------------------------------- */

#ifndef HIP_KERNEL
#define HIP_KERNEL


#include <hip/hip_runtime.h>
#include "hip_device.h"
#include <fstream>
#include <string>
#include <iostream>

namespace ucl_hip {

class UCL_Texture;
template <class numtyp> class UCL_D_Vec;
template <class numtyp> class UCL_D_Mat;
template <class hosttype, class devtype> class UCL_Vector;
template <class hosttype, class devtype> class UCL_Matrix;
#define UCL_MAX_KERNEL_ARGS 256

/// Holder for one or more kernel functions built from a single string or file
/** Keeps a pointer to the owning device, the loaded module handle and the
  * device's command queue at the time of construction / init(). **/
class UCL_Program {
  UCL_Device* _device_ptr;
 public:
  /// Attach to a device without loading any code
  inline UCL_Program(UCL_Device &device)
    : _device_ptr(&device), _cq(device.cq()) {}

  /// Attach to a device and immediately load the module image in \a program
  inline UCL_Program(UCL_Device &device, const void *program,
                     const char *flags="", std::string *log=NULL)
    : _device_ptr(&device), _cq(device.cq()) {
    init(device);
    load_string(program,flags,log);
  }

  inline ~UCL_Program() {}

  /// Initialize the program with a device
  inline void init(UCL_Device &device) {
    _device_ptr = &device;
    _cq = device.cq();
  }

  /// Clear any data associated with program
  /** \note Must call init() after each clear **/
  inline void clear() { }

  /// Load a program from a file and compile with flags
  /** The whole file is read into memory and handed to load_string().
    * \return UCL_FILE_NOT_FOUND if the file cannot be opened, otherwise the
    *         result of load_string() **/
  inline int load(const char *filename, const char *flags="", std::string *log=NULL) {
    std::ifstream kernel_file(filename);
    if (kernel_file && kernel_file.is_open()) {
      typedef std::istreambuf_iterator<char> char_iter;
      std::string source_text((char_iter(kernel_file)), char_iter());
      kernel_file.close();
      return load_string(source_text.c_str(),flags,log);
    }

    #ifndef UCL_NO_EXIT
    std::cerr << "UCL Error: Could not open kernel file: "
              << filename << std::endl;
    UCL_GERYON_EXIT;
    #endif
    return UCL_FILE_NOT_FOUND;
  }

  /// Load a program from a string and compile with flags
  /** \note \a flags is accepted but not used here; the work is delegated to
    *       the device's load_module() **/
  inline int load_string(const void *program, const char *flags="", std::string *log=NULL) {
    return _device_ptr->load_module(program, _module, log);
  }

  friend class UCL_Kernel;
  friend class UCL_Texture;
 private:
  hipModule_t _module;
  hipStream_t _cq;
};

/// Class for dealing with HIP module kernels
/** Holds a function handle looked up from a UCL_Program, the launch geometry
  * (number of blocks / block size) and a packed byte buffer containing the
  * kernel arguments in the order they were added. **/
class UCL_Kernel {
 public:
  /// Create a kernel with no function attached
  /** All members get defined values so that cq() or run() called before
    * set_function()/set_size() never read indeterminate data. **/
  UCL_Kernel() : _kernel(), _cq(), _dimensions(1), _num_args(0) {
    _default_launch_size();
  }

  /// Create a kernel and look up \a function in \a program
  UCL_Kernel(UCL_Program &program, const char *function) :
    _kernel(), _cq(), _dimensions(1), _num_args(0) {
    _default_launch_size();
    set_function(program,function);
    // set_function() returns before touching _cq when the lookup fails, so
    // the program's queue is taken unconditionally here.
    _cq=program._cq;
  }

  ~UCL_Kernel() {}

  /// Clear any function associated with the kernel
  inline void clear() { }

  /// Get the kernel function from a program
  /** On success the kernel also adopts the program's command queue.
    * \ret UCL_ERROR_FLAG (UCL_SUCCESS, UCL_FUNCTION_NOT_FOUND) **/
  inline int set_function(UCL_Program &program, const char *function) {
    hipError_t err=hipModuleGetFunction(&_kernel,program._module,function);
    if (err!=hipSuccess) {
      #ifndef UCL_NO_EXIT
      std::cerr << "UCL Error: Could not find function: " << function
                << " in program.\n";
      UCL_GERYON_EXIT;
      #endif
      return UCL_FUNCTION_NOT_FOUND;
    }
    _cq=program._cq;
    return UCL_SUCCESS;
  }

  /// Set the kernel argument.
  /** If not a device pointer, this must be repeated each time the argument
    * changes
    * \note To set kernel parameter i (i>0), parameter i-1 must be set
    * \note Only appending (index equal to the current argument count) is
    *       implemented; any other index triggers an assertion **/
  template <class dtype>
  inline void set_arg(const unsigned index, const dtype * const arg) {
    if (index==_num_args)
      add_arg(arg);
    else if (index<_num_args){
      assert(0==1); // not implemented
    }
    else
      assert(0==1); // Must add kernel parameters in sequential order
  }

  // NOTE(review): the four container overloads of set_arg below forward to a
  // one-argument set_arg with a pointer to the device address; no overload
  // visible in this class accepts that, so they presumably fail to compile
  // once instantiated - confirm the intended index / target.

  /// Set a geryon container as a kernel argument.
  template <class numtyp>
  inline void set_arg(const UCL_D_Vec<numtyp> * const arg)
    { set_arg(&arg->begin()); }

  /// Set a geryon container as a kernel argument.
  template <class numtyp>
  inline void set_arg(const UCL_D_Mat<numtyp> * const arg)
    { set_arg(&arg->begin()); }

  /// Set a geryon container as a kernel argument.
  template <class hosttype, class devtype>
  inline void set_arg(const UCL_Vector<hosttype, devtype> * const arg)
    { set_arg(&arg->device.begin()); }

  /// Set a geryon container as a kernel argument.
  template <class hosttype, class devtype>
  inline void set_arg(const UCL_Matrix<hosttype, devtype> * const arg)
    { set_arg(&arg->device.begin()); }

  /// Add a kernel argument.
  /** A device pointer is stored as a plain void* value. **/
  inline void add_arg(const hipDeviceptr_t* const arg) {
    add_arg<void*>((void**)arg);
  }

  /// Add a kernel argument.
  /** The value pointed to by \a arg is appended to the argument buffer at the
    * next offset that is a multiple of alignof(dtype); padding bytes are
    * zero. The bytes are copied through a char pointer rather than stored
    * through a dtype* cast into the char buffer, which avoids the
    * type-punned write. **/
  template <class dtype>
  inline void add_arg(const dtype* const arg) {
    const size_t align = alignof(dtype);
    const size_t offset = (_hip_kernel_args.size()+align-1) & ~(align-1);
    _hip_kernel_args.resize(offset);  // zero-filled padding up to the offset
    const char * const bytes = reinterpret_cast<const char*>(arg);
    _hip_kernel_args.insert(_hip_kernel_args.end(), bytes,
                            bytes+sizeof(dtype));
    _num_args++;
    if (_num_args>UCL_MAX_KERNEL_ARGS) assert(0==1);
  }

  /// Add a geryon container as a kernel argument.
  template <class numtyp>
  inline void add_arg(const UCL_D_Vec<numtyp> * const arg)
    { add_arg(&arg->begin()); }

  /// Add a geryon container as a kernel argument.
  template <class numtyp>
  inline void add_arg(const UCL_D_Mat<numtyp> * const arg)
    { add_arg(&arg->begin()); }

  /// Add a geryon container as a kernel argument.
  template <class hosttype, class devtype>
  inline void add_arg(const UCL_Vector<hosttype, devtype> * const arg)
    { add_arg(&arg->device.begin()); }

  /// Add a geryon container as a kernel argument.
  template <class hosttype, class devtype>
  inline void add_arg(const UCL_Matrix<hosttype, devtype> * const arg)
    { add_arg(&arg->device.begin()); }

  /// Set the number of thread blocks and the number of threads in each block
  /** \note This should be called before any arguments have been added
      \note The default command queue is used for the kernel execution **/
  inline void set_size(const size_t num_blocks, const size_t block_size) {
    _dimensions=1;
    _num_blocks[0]=num_blocks;
    _num_blocks[1]=1;
    _num_blocks[2]=1;

    _block_size[0]=block_size;
    _block_size[1]=1;
    _block_size[2]=1;
  }

  /// Set the number of thread blocks and the number of threads in each block
  /** \note This should be called before any arguments have been added
      \note The default command queue for the kernel is changed to cq **/
  inline void set_size(const size_t num_blocks, const size_t block_size,
                       command_queue &cq)
    { _cq=cq; set_size(num_blocks,block_size); }

  /// Set the number of thread blocks and the number of threads in each block
  /** \note This should be called before any arguments have been added
      \note The default command queue is used for the kernel execution **/
  inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
                       const size_t block_size_x, const size_t block_size_y) {
    _dimensions=2;
    _num_blocks[0]=num_blocks_x;
    _num_blocks[1]=num_blocks_y;
    _num_blocks[2]=1;

    _block_size[0]=block_size_x;
    _block_size[1]=block_size_y;
    _block_size[2]=1;
  }

  /// Set the number of thread blocks and the number of threads in each block
  /** \note This should be called before any arguments have been added
      \note The default command queue for the kernel is changed to cq **/
  inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
                       const size_t block_size_x, const size_t block_size_y,
                       command_queue &cq)
    {_cq=cq; set_size(num_blocks_x, num_blocks_y, block_size_x, block_size_y);}

  /// Set a 2-D grid of thread blocks with a 3-D block size
  /** \note This should be called before any arguments have been added
      \note The default command queue is used for the kernel execution **/
  inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
                       const size_t block_size_x,
                       const size_t block_size_y, const size_t block_size_z) {
    // NOTE(review): _dimensions stays 2 although the block is 3-D; the value
    // is not read anywhere in this class, so it is left unchanged.
    _dimensions=2;
    _num_blocks[0]=num_blocks_x;
    _num_blocks[1]=num_blocks_y;
    _num_blocks[2]=1;

    _block_size[0]=block_size_x;
    _block_size[1]=block_size_y;
    _block_size[2]=block_size_z;
  }

  /// Set a 2-D grid of thread blocks with a 3-D block size
  /** \note This should be called before any arguments have been added
      \note The default command queue for the kernel is changed to cq **/
  inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
                       const size_t block_size_x, const size_t block_size_y,
                       const size_t block_size_z, command_queue &cq) {
    _cq=cq;
    set_size(num_blocks_x, num_blocks_y, block_size_x, block_size_y,
             block_size_z);
  }

  /// Run the kernel in the default command queue
  /** The packed argument buffer is passed to hipModuleLaunchKernel() through
    * the "extra" launch configuration (buffer pointer + buffer size); the
    * launch result is checked with CU_SAFE_CALL. **/
  inline void run() {
    size_t args_size = _hip_kernel_args.size();
    void *config[] = {
            HIP_LAUNCH_PARAM_BUFFER_POINTER, (void*)_hip_kernel_args.data(),
            HIP_LAUNCH_PARAM_BUFFER_SIZE,    &args_size,
            HIP_LAUNCH_PARAM_END
    };
    const auto res = hipModuleLaunchKernel(_kernel,_num_blocks[0],_num_blocks[1],
                                _num_blocks[2],_block_size[0],_block_size[1],
                                _block_size[2],0,_cq, NULL, config);
    CU_SAFE_CALL(res);
  }

  /// Clear any arguments associated with the kernel
  inline void clear_args() {
    _num_args=0;
    _hip_kernel_args.clear();
  }

  /// Return the default command queue/stream associated with this data
  inline command_queue & cq() { return _cq; }
  /// Change the default command queue associated with matrix
  inline void cq(command_queue &cq_in) { _cq=cq_in; }
  #include "ucl_arg_kludge.h"

 private:
  /// Give the launch geometry defined values: an empty 1-D grid of 1x1x1 blocks
  inline void _default_launch_size() {
    _num_blocks[0]=0;
    _num_blocks[1]=1;
    _num_blocks[2]=1;

    _block_size[0]=1;
    _block_size[1]=1;
    _block_size[2]=1;
  }

  hipFunction_t _kernel;
  hipStream_t _cq;
  unsigned _dimensions;
  unsigned _num_blocks[3];
  unsigned _num_args;
  friend class UCL_Texture;

  unsigned _block_size[3];
  std::vector<char> _hip_kernel_args;  // packed kernel arguments
};

} // namespace

#endif
+83 −0
Original line number Diff line number Diff line
#ifndef HIP_MACROS_H
#define HIP_MACROS_H

#include <cstdio>
#include <cassert>
#include <hip/hip_runtime.h>

//#if CUDA_VERSION >= 3020
#define CUDA_INT_TYPE size_t
//#else
//#define CUDA_INT_TYPE unsigned
//#endif

// Fatal-error exit used by the library.
//  - With MPI_GERYON: abort the whole MPI job with code -1, unless MPI has
//    already been finalized, in which case the macro does nothing.
//  - Otherwise: a failing assert (which has no effect when NDEBUG is defined).
#ifdef MPI_GERYON
#include "mpi.h"
#define NVD_GERYON_EXIT do {                                               \
  int is_final;                                                            \
  MPI_Finalized(&is_final);                                                \
  if (!is_final)                                                           \
    MPI_Abort(MPI_COMM_WORLD,-1);                                          \
  } while(0)
#else
#define NVD_GERYON_EXIT assert(0==1)
#endif

// UCL_GERYON_EXIT may be predefined by the including code; by default it
// maps to NVD_GERYON_EXIT.
#ifndef UCL_GERYON_EXIT
#define UCL_GERYON_EXIT NVD_GERYON_EXIT
#endif

// UCL_DEBUG turns on both the synchronizing API checks and the checks on
// calls made from destructors (see the macros defined below).
#ifdef UCL_DEBUG
#define UCL_SYNC_DEBUG
#define UCL_DESTRUCT_CHECK
#endif

// API call checking.
//  CU_SAFE_CALL_NS(call): evaluate 'call' once; on failure print the error
//                         code and location to stderr, then NVD_GERYON_EXIT.
//  CU_SAFE_CALL(call)   : same, and with UCL_SYNC_DEBUG additionally
//                         synchronize and check the result of that too.
// With UCL_NO_API_CHECK defined both macros expand to the bare call.
//
// The macro-local result variables use reserved-looking unique names and the
// argument is parenthesized, so an argument expression that itself mentions a
// variable called 'err' (or uses low-precedence operators) is evaluated as
// the caller wrote it.
#ifndef UCL_NO_API_CHECK

#define CU_SAFE_CALL_NS( call ) do {                                         \
    const hipError_t ucl_hip_err_ns_ = (call);                               \
    if( hipSuccess != ucl_hip_err_ns_) {                                     \
        fprintf(stderr, "HIP runtime error %d in call at file '%s' in line %i.\n",   \
                ucl_hip_err_ns_, __FILE__, __LINE__ );                       \
        NVD_GERYON_EXIT;                                                     \
    } } while (0)

#ifdef UCL_SYNC_DEBUG

#define CU_SAFE_CALL( call ) do {                                            \
    CU_SAFE_CALL_NS( call );                                                 \
    const hipError_t ucl_hip_err_sync_ = hipCtxSynchronize();                \
    if( hipSuccess != ucl_hip_err_sync_) {                                   \
        fprintf(stderr, "HIP runtime error %d in file '%s' in line %i.\n",   \
                ucl_hip_err_sync_, __FILE__, __LINE__ );                     \
        NVD_GERYON_EXIT;                                                     \
    } } while (0)

#else

#define CU_SAFE_CALL( call ) CU_SAFE_CALL_NS( call )

#endif

#else  // UCL_NO_API_CHECK defined

// void macros for performance reasons
#define CU_SAFE_CALL_NS( call ) call
#define CU_SAFE_CALL( call) call

#endif

// Wrappers for API calls made during destruction: checked like
// CU_SAFE_CALL / CU_SAFE_CALL_NS only when UCL_DESTRUCT_CHECK is defined
// (UCL_DEBUG defines it); otherwise they expand to the bare call.
#ifdef UCL_DESTRUCT_CHECK

#define CU_DESTRUCT_CALL( call) CU_SAFE_CALL( call)
#define CU_DESTRUCT_CALL_NS( call) CU_SAFE_CALL_NS( call)

#else

#define CU_DESTRUCT_CALL( call) call
#define CU_DESTRUCT_CALL_NS( call) call

#endif

#endif
+43 −0
Original line number Diff line number Diff line
/* -----------------------------------------------------------------------
   Copyright (2010) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the Simplified BSD License.
   ----------------------------------------------------------------------- */

/*! \file */

#ifndef HIP_MAT_H
#define HIP_MAT_H


#include <hip/hip_runtime.h>
#include "hip_memory.h"

/// Namespace for HIP routines
namespace ucl_hip {

// The generic geryon container, copy and print headers are included inside
// this namespace, so their contents are declared as members of ucl_hip.
// Each group is wrapped in a macro that is defined only for the duration of
// the includes - presumably the headers test these macros to reject direct
// inclusion; TODO confirm against the ucl_*.h headers.
// _UCL_DEVICE_PTR_MAT is likewise only defined while the container headers
// are read.
#define _UCL_MAT_ALLOW
#define _UCL_DEVICE_PTR_MAT
#include "ucl_basemat.h"
#include "ucl_h_vec.h"
#include "ucl_h_mat.h"
#include "ucl_d_vec.h"
#include "ucl_d_mat.h"
#include "ucl_s_obj_help.h"
#include "ucl_vector.h"
#include "ucl_matrix.h"
#undef _UCL_DEVICE_PTR_MAT
#undef _UCL_MAT_ALLOW

#define UCL_COPY_ALLOW
#include "ucl_copy.h"
#undef UCL_COPY_ALLOW

#define UCL_PRINT_ALLOW
#include "ucl_print.h"
#undef UCL_PRINT_ALLOW

} // namespace ucl_hip

#endif
Loading