Unverified Commit c31d7c6f authored by Steve Plimpton's avatar Steve Plimpton Committed by GitHub
Browse files

Merge pull request #1009 from ndtrung81/gpu-maint

Fixed bugs in the Tersoff GPU styles for OpenCL builds and performed some maintenance
parents 45532b71 923ae041
Loading
Loading
Loading
Loading
+3 −0
Original line number Diff line number Diff line
@@ -260,6 +260,9 @@ class UCL_Device {
  /// List all devices along with all properties
  inline void print_all(std::ostream &out);

  /// Choose the platform containing accelerators.
  /** No-op compatibility shim for the CUDA back end; the OpenCL build
    * performs a real platform search. Always reports success here. **/
  inline int set_platform_accelerator(int pid=-1) { (void)pid; return UCL_SUCCESS; }

 private:
  int _device, _num_devices;
  std::vector<NVDProperties> _properties;
+4 −0
Original line number Diff line number Diff line
@@ -322,10 +322,12 @@ class Atom {

  // Asynchronously copy per-atom charges to the device (no-op when the
  // device copy is already current); transfer time is accumulated in time_q.
  inline void add_q_data() {
    time_q.start();
    const bool needs_copy = (_q_avail == false);
    if (needs_copy) {
      q.update_device(_nall, true);   // 'true' requests an asynchronous transfer
      _q_avail = true;                // mark device copy as up to date
    }
    time_q.stop();
  }

  // Cast quaternions to write buffer
@@ -347,10 +349,12 @@ class Atom {
  // Copy per-atom quaternions to the device when the device copy is stale.
  /** Transfers nall()*4 elements; timing accumulated in time_quat. **/
  inline void add_quat_data() {
    time_quat.start();
    const bool needs_copy = (_quat_avail == false);
    if (needs_copy) {
      quat.update_device(_nall * 4, true);   // 4 components per quaternion, async
      _quat_avail = true;                    // mark device copy as up to date
    }
    time_quat.stop();
  }

  /// Cast velocities and tags to write buffer
+8 −0
Original line number Diff line number Diff line
@@ -130,8 +130,16 @@ int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu,

  // Time on the device only if 1 proc per gpu
  _time_device=true;

#if 0
  // XXX: the following setting triggers a memory leak with OpenCL and MPI
  //      setting _time_device=true for all processes doesn't seem to be a
  //      problem with either (no segfault, no (large) memory leak.
  //      thus keeping this disabled for now. may need to review later.
  //      2018-07-23 <akohlmey@gmail.com>
  if (_procs_per_gpu>1)
    _time_device=false;
#endif

  // Set up a per device communicator
  MPI_Comm_split(node_comm,my_gpu,0,&_comm_gpu);
+7 −7
Original line number Diff line number Diff line
@@ -127,10 +127,10 @@ void Neighbor::alloc(bool &success) {
    dev_packed.clear();
    success=success && (dev_packed.alloc((_max_nbors+2)*_max_atoms,*dev,
                                         _packed_permissions)==UCL_SUCCESS);
    dev_acc.clear();
    success=success && (dev_acc.alloc(_max_atoms,*dev,
    dev_ilist.clear();
    success=success && (dev_ilist.alloc(_max_atoms,*dev,
                                      UCL_READ_WRITE)==UCL_SUCCESS);
    _c_bytes+=dev_packed.row_bytes()+dev_acc.row_bytes();
    _c_bytes+=dev_packed.row_bytes()+dev_ilist.row_bytes();
  }
  if (_max_host>0) {
    nbor_host.clear();
@@ -197,7 +197,7 @@ void Neighbor::clear() {

    host_packed.clear();
    host_acc.clear();
    dev_acc.clear();
    dev_ilist.clear();
    dev_nbor.clear();
    nbor_host.clear();
    dev_packed.clear();
@@ -281,7 +281,7 @@ void Neighbor::get_host(const int inum, int *ilist, int *numj,
  }
  UCL_D_Vec<int> acc_view;
  acc_view.view_offset(inum,dev_nbor,inum*2);
  ucl_copy(acc_view,host_acc,true);
  ucl_copy(acc_view,host_acc,inum*2,true);

  UCL_H_Vec<int> host_view;
  host_view.alloc(_max_atoms,*dev,UCL_READ_WRITE);
@@ -289,7 +289,7 @@ void Neighbor::get_host(const int inum, int *ilist, int *numj,
    int i=ilist[ii];
    host_view[i] = ii;
  }
  ucl_copy(dev_acc,host_view,true);
  ucl_copy(dev_ilist,host_view,true);

  time_nbor.stop();

@@ -364,7 +364,7 @@ void Neighbor::get_host3(const int inum, const int nlist, int *ilist, int *numj,
  }
  UCL_D_Vec<int> acc_view;
  acc_view.view_offset(inum,dev_nbor,inum*2);
  ucl_copy(acc_view,host_acc,true);
  ucl_copy(acc_view,host_acc,inum*2,true);
  time_nbor.stop();

  if (_use_packing==false) {
+2 −2
Original line number Diff line number Diff line
@@ -110,7 +110,7 @@ class Neighbor {
      }
      if (_time_device) {
        time_nbor.add_to_total();
        time_kernel.add_to_total();
        if (_use_packing==false) time_kernel.add_to_total();
        if (_gpu_nbor==2) {
          time_hybrid1.add_to_total();
          time_hybrid2.add_to_total();
@@ -200,7 +200,7 @@ class Neighbor {
  /// Host storage for nbor counts (row 1) & accumulated neighbor counts (row2)
  UCL_H_Vec<int> host_acc;
  /// Device storage for accessing atom indices from the neighbor list (3-body)
  UCL_D_Vec<int> dev_acc;
  UCL_D_Vec<int> dev_ilist;

  // ----------------- Data for GPU Neighbor Calculation ---------------

Loading