Commit a2f9fa8e authored by Vsevak's avatar Vsevak
Browse files

Separate the computation of hneigh into another kernel

Simplify the main GPU kernel and add another kernel 'k_pair_reneigh'. It works good on GTX1070 (Pascal), but still there is a problem with non-deterministic results on Volta.

I reimplement BaseCharge::compute methods in the child class LJ_TIP4PLong to correctly embed a new kernel in the code.

Also commit includes some codestyle fixes.
parent 66a076b8
Loading
Loading
Loading
Loading
+143 −17
Original line number Diff line number Diff line
@@ -24,26 +24,26 @@ const char *lj_tip4p=0;
#include "lal_lj_tip4p_long.h"
#include <cassert>
using namespace LAMMPS_AL;
#define LJ_TIP4PLong_T LJ_TIP4PLong<numtyp, acctyp>
#define LJTIP4PLongT LJ_TIP4PLong<numtyp, acctyp>

extern Device<PRECISION,ACC_PRECISION> device;

template <class numtyp, class acctyp>
LJ_TIP4PLong<numtyp, acctyp>::LJ_TIP4PLong(): BaseCharge<numtyp,acctyp>(), _allocated(false) {
LJTIP4PLongT::LJ_TIP4PLong(): BaseCharge<numtyp,acctyp>(), _allocated(false) {
}

template <class numtyp, class acctyp>
LJ_TIP4PLong<numtyp, acctyp>::~LJ_TIP4PLong() {
LJTIP4PLongT::~LJ_TIP4PLong() {
  clear();
}

template <class numtyp, class acctyp>
int LJ_TIP4PLong<numtyp, acctyp>::bytes_per_atom(const int max_nbors) const {
int LJTIP4PLongT::bytes_per_atom(const int max_nbors) const {
  return this->bytes_per_atom_atomic(max_nbors);
}

template <class numtyp, class acctyp>
int LJ_TIP4PLong<numtyp, acctyp>::init(const int ntypes,
int LJTIP4PLongT::init(const int ntypes,
    double **host_cutsq, double **host_lj1,
    double **host_lj2, double **host_lj3,
    double **host_lj4, double **host_offset,
@@ -65,6 +65,7 @@ int LJ_TIP4PLong<numtyp, acctyp>::init(const int ntypes,
  if (success!=0)
    return success;
  k_pair_distrib.set_function(*this->pair_program,"k_lj_tip4p_long_distrib");
  k_pair_reneigh.set_function(*this->pair_program,"k_lj_tip4p_reneigh");

  TypeH = tH;
  TypeO = tO;
@@ -143,7 +144,7 @@ int LJ_TIP4PLong<numtyp, acctyp>::init(const int ntypes,


template <class numtyp, class acctyp>
void LJ_TIP4PLong<numtyp, acctyp>::clear() {
void LJTIP4PLongT::clear() {
  if (!_allocated)
    return;
  _allocated=false;
@@ -161,12 +162,13 @@ void LJ_TIP4PLong<numtyp, acctyp>::clear() {
  //force_comp.clear();

  k_pair_distrib.clear();
  k_pair_reneigh.clear();

  this->clear_atomic();
}

template <class numtyp, class acctyp>
double LJ_TIP4PLong<numtyp, acctyp>::host_memory_usage() const {
double LJTIP4PLongT::host_memory_usage() const {
  return this->host_memory_usage_atomic()+sizeof(LJ_TIP4PLong<numtyp,acctyp>);
}

@@ -174,7 +176,7 @@ double LJ_TIP4PLong<numtyp, acctyp>::host_memory_usage() const {
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void LJ_TIP4PLong<numtyp, acctyp>::loop(const bool _eflag, const bool _vflag) {
void LJTIP4PLongT::loop(const bool _eflag, const bool _vflag) {
  // Compute the block size and grid size to keep all cores busy
  const int BX=this->block_size();
  int eflag, vflag;
@@ -188,13 +190,24 @@ void LJ_TIP4PLong<numtyp, acctyp>::loop(const bool _eflag, const bool _vflag) {
  else
    vflag=0;

  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
                               (BX/this->_threads_per_atom)));

  int ainum=this->ans->inum();
  const int nall = this->atom->nall();
  int nbor_pitch=this->nbor->nbor_pitch();
  this->time_pair.start();
  int GX;

  if (t_ago == 0) {
    GX=static_cast<int>(ceil(static_cast<double>(nall)/BX));
    this->k_pair_reneigh.set_size(GX,BX);
    this->k_pair_reneigh.run(&this->atom->x,
        &this->nbor->dev_nbor, &this->_nbor_data->begin(),
        &nall, &ainum,&nbor_pitch, &this->_threads_per_atom,
        &hneight, &m, &TypeO, &TypeH,
        &tag, &map_array, &atom_sametag);
  }

  GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
                               (BX/this->_threads_per_atom)));
  this->k_pair.set_size(GX,BX);
  if (vflag){
	  this->ansO.resize_ib(ainum*3);
@@ -213,17 +226,16 @@ void LJ_TIP4PLong<numtyp, acctyp>::loop(const bool _eflag, const bool _vflag) {
          &atom_sametag, &this->ansO);
  GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/BX));
  this->k_pair_distrib.set_size(GX,BX);
  this->k_pair_distrib.run(&this->atom->x, &this->ans->force, &this->ans->engv, &eflag, &vflag,
      &ainum, &nbor_pitch, &this->_threads_per_atom,
      &hneight, &m, &TypeO, &TypeH, &alpha,
      &this->atom->q,  &this->ansO);
  this->k_pair_distrib.run(&this->atom->x, &this->ans->force, &this->ans->engv,
      &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom,
      &hneight, &m, &TypeO, &TypeH, &alpha,&this->atom->q,  &this->ansO);
  this->time_pair.stop();
}


template <class numtyp, class acctyp>
void LJ_TIP4PLong<numtyp, acctyp>::copy_relations_data(int **hn, double **newsite, int n,
    int* tag, int *map_array, int map_size, int *sametag, int max_same, int ago){
void LJTIP4PLongT::copy_relations_data(int n, int* tag, int *map_array,
                      int map_size, int *sametag, int max_same, int ago){
  int nall = n;
  const int hn_sz = n*4; // matrix size = col size * col number
  hneight.resize_ib(hn_sz);
@@ -250,4 +262,118 @@ void LJ_TIP4PLong<numtyp, acctyp>::copy_relations_data(int **hn, double **newsit
  host_tag_write.clear();
}




// ---------------------------------------------------------------------------
// Copy nbor list from host if necessary and then calculate forces, virials,..
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void LJTIP4PLongT::compute(const int f_ago, const int inum_full,
                               const int nall, double **host_x, int *host_type,
                               int *ilist, int *numj, int **firstneigh,
                               const bool eflag, const bool vflag,
                               const bool eatom, const bool vatom,
                               int &host_start, const double cpu_time,
                               bool &success, double *host_q,
                               const int nlocal, double *boxlo, double *prd) {
  this->acc_timers();
  if (inum_full==0) {
    host_start=0;
    // Make sure textures are correct if realloc by a different hybrid style
    this->resize_atom(0,nall,success);
    this->zero_timers();
    return;
  }

  int ago=this->hd_balancer.ago_first(f_ago);
  int inum=this->hd_balancer.balance(ago,inum_full,cpu_time);
  this->ans->inum(inum);
  host_start=inum;

  if (ago==0) {
    this->reset_nbors(nall, inum, ilist, numj, firstneigh, success);
    if (!success)
      return;
  }

  this->atom->cast_x_data(host_x,host_type);
  this->atom->cast_q_data(host_q);
  this->hd_balancer.start_timer();
  this->atom->add_x_data(host_x,host_type);
  this->atom->add_q_data();

  this->device->precompute(f_ago,nlocal,nall,host_x,host_type,success,host_q,
                     boxlo, prd);

  t_ago = ago;
  loop(eflag,vflag);
  this->ans->copy_answers(eflag,vflag,eatom,vatom,ilist);
  this->device->add_ans_object(this->ans);
  this->hd_balancer.stop_timer();
}

// ---------------------------------------------------------------------------
// Reneighbor on GPU if necessary and then compute forces, virials, energies
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int** LJTIP4PLongT::compute(const int ago, const int inum_full,
                                const int nall, double **host_x, int *host_type,
                                double *sublo, double *subhi, tagint *tag,
                                int *map_array, int map_size, int *sametag, int max_same,
                                int **nspecial, tagint **special, const bool eflag,
                                const bool vflag, const bool eatom,
                                const bool vatom, int &host_start,
                                int **ilist, int **jnum,
                                const double cpu_time, bool &success,
                                double *host_q, double *boxlo, double *prd) {
  this->acc_timers();
  if (inum_full==0) {
    host_start=0;
    // Make sure textures are correct if realloc by a different hybrid style
    this->resize_atom(0,nall,success);
    this->zero_timers();
    return NULL;
  }

  this->hd_balancer.balance(cpu_time);
  int inum=this->hd_balancer.get_gpu_count(ago,inum_full);
  this->ans->inum(inum);
  host_start=inum;

  // Build neighbor list on GPU if necessary
  if (ago==0) {
    this->build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
                    sublo, subhi, tag, nspecial, special, success);
    if (!success)
      return NULL;
    this->atom->cast_q_data(host_q);
    this->hd_balancer.start_timer();
  } else {
    this->atom->cast_x_data(host_x,host_type);
    this->atom->cast_q_data(host_q);
    this->hd_balancer.start_timer();
    this->atom->add_x_data(host_x,host_type);
  }
  this->atom->add_q_data();
  *ilist=this->nbor->host_ilist.begin();
  *jnum=this->nbor->host_acc.begin();

  copy_relations_data(nall, tag, map_array, map_size, sametag, max_same, ago);

  this->device->precompute(ago,inum_full,nall,host_x,host_type,success,host_q,
                     boxlo, prd);

  t_ago = ago;
  loop(eflag,vflag);
  this->ans->copy_answers(eflag,vflag,eatom,vatom);
  this->device->add_ans_object(this->ans);
  this->hd_balancer.stop_timer();

  return this->nbor->host_jlist.begin()-host_start;
}




template class LJ_TIP4PLong<PRECISION,ACC_PRECISION>;
+66 −73
Original line number Diff line number Diff line
@@ -158,6 +158,62 @@ __kernel void k_lj_tip4p_long_distrib(const __global numtyp4 *restrict x_,
  } // if ii
}

__kernel void k_lj_tip4p_reneigh(const __global numtyp4 *restrict x_,
    const __global int * dev_nbor,
    const __global int * dev_packed,
    const int nall, const int inum,
    const int nbor_pitch, const int t_per_atom,
    __global int *restrict hneigh,
    __global numtyp4 *restrict m,
    const int typeO, const int typeH,
    const __global int *restrict tag, const __global int *restrict map,
    const __global int *restrict sametag) {
  int tid, ii, offset;
  atom_info(t_per_atom,ii,tid,offset);
  int i = BLOCK_ID_X*(BLOCK_SIZE_X)+tid;

  if (i<nall) {
    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];

    int iH1, iH2, iO;
    int itype = ix.w;
    if(itype == typeO) {
      iO  = i;
      if (hneigh[i*4+2] != -1) {
        iH1 = atom_mapping(map, tag[i] + 1);
        iH2 = atom_mapping(map, tag[i] + 2);
        // set iH1,iH2 to closest image to O
        iH1 = closest_image(i, iH1, sametag, x_);
        iH2 = closest_image(i, iH2, sametag, x_);
        hneigh[i*4  ] = iH1;
        hneigh[i*4+1] = iH2;
        hneigh[i*4+2] = -1;
      }
    } else {
      if (hneigh[i*4+2] != -1) {
        int iI, iH;
        iI = atom_mapping(map,tag[i] - 1);
        numtyp4 iIx; fetch4(iIx,iI,pos_tex); //x_[iI];
        if ((int)iIx.w == typeH) {
          iO = atom_mapping(map,tag[i] - 2);
          iO  = closest_image(i, iO, sametag, x_);
          iH1 = closest_image(i, iI, sametag, x_);
          iH2 = i;
        } else { //if ((int)iIx.w == typeO)
          iH = atom_mapping(map, tag[i] + 1);
          iO  = closest_image(i,iI,sametag, x_);
          iH1 = i;
          iH2 = closest_image(i,iH,sametag, x_);
        }
        hneigh[i*4+0] = iO;
        hneigh[i*4+1] += -1;
        hneigh[i*4+2] = -1;
      }
    }
  }
}


__kernel void k_lj_tip4p_long(const __global numtyp4 *restrict x_,
    const __global numtyp4 *restrict lj1,
    const __global numtyp4 *restrict lj3,
@@ -211,63 +267,17 @@ __kernel void k_lj_tip4p_long(const __global numtyp4 *restrict x_,

    if(itype == typeO) {
      iO  = i;
      if (hneigh[i*4+2] != -1) {
        iH1 = atom_mapping(map, tag[i] + 1);
        iH2 = atom_mapping(map, tag[i] + 2);
        // set iH1,iH2 to closest image to O
        iH1 = closest_image(i, iH1, sametag, x_);
        iH2 = closest_image(i, iH2, sametag, x_);
        hneigh[  i*4  ] = iH1;
        hneigh[  i*4+1] = iH2;
        hneigh[  i*4+2] = -1;
        hneigh[iH1*4  ] = i;
        hneigh[iH1*4+1] += -1;
        hneigh[iH1*4+2] = -1;
        hneigh[iH2*4  ] = i;
        hneigh[iH2*4+1] += -1;
        hneigh[iH2*4+2] = -1;
      } else {
      iH1 = hneigh[i*4  ];
      iH2 = hneigh[i*4+1];
      }
      if(fabs(m[iO].w) <= eq_zero) {
        compute_newsite(iO,iH1,iH2, &m[iO], alpha, x_);
        m[iO].w = qtmp;
      }
      x1 = m[iO];
    } else {
      if (hneigh[i*4+2] != -1) {
        int iI, iH;
        iI = atom_mapping(map,tag[i] - 1);
        numtyp4 iIx; fetch4(iIx,iI,pos_tex); //x_[iI];
        if ((int)iIx.w == typeH) {
          iO = atom_mapping(map,tag[i] - 2);
          iO  = closest_image(i, iO, sametag, x_);
          iH1 = closest_image(i, iI, sametag, x_);
          iH2 = i;
        } else { //if ((int)iIx.w == typeO)
          iH = atom_mapping(map, tag[i] + 1);
          iO  = closest_image(i,iI,sametag, x_);
          iH1 = i;
          iH2 = closest_image(i,iH,sametag, x_);
        }

        hneigh[iH1*4+0] = iO;
        hneigh[iH1*4+1] += -1;
        hneigh[iH1*4+2] = -1;

        hneigh[iH2*4+0] = iO;
        hneigh[iH2*4+1] += -1;
        hneigh[iH2*4+2] = -1;

        hneigh[iO*4+0] = iH1;
        hneigh[iO*4+1] = iH2;
        hneigh[iO*4+2] = -1;
    } else {
      iO  = hneigh[i *4  ];
      iH1 = hneigh[iO*4  ];
      iH2 = hneigh[iO*4+1];
      }
      if (iO >= inum) {
        non_local_oxy = 1;
        if(fabs(m[iO].w) <= eq_zero) {
@@ -327,25 +337,8 @@ __kernel void k_lj_tip4p_long(const __global numtyp4 *restrict x_,
        if(itype == typeO || jtype == typeO) {
          if (jtype == typeO) {
            jO = j;
            if (hneigh[j*4+2] != -1) {
              jH1 = atom_mapping(map,tag[j] + 1);
              jH2 = atom_mapping(map,tag[j] + 2);
              // set iH1,iH2 to closest image to O
              jH1 = closest_image(j, jH1, sametag, x_);
              jH2 = closest_image(j, jH2, sametag, x_);
              hneigh[j*4  ] = jH1;
              hneigh[j*4+1] = jH2;
              hneigh[j*4+2] = -1;
              hneigh[jH1*4  ] = j;
              hneigh[jH1*4+1] += -1;
              hneigh[jH1*4+2] = -1;
              hneigh[jH2*4  ] = j;
              hneigh[jH2*4+1] += -1;
              hneigh[jH2*4+2] = -1;
            } else {
            jH1 = hneigh[j*4  ];
            jH2 = hneigh[j*4+1];
            }
            if (fabs(m[j].w) <= eq_zero) {
              compute_newsite(j, jH1, jH2, &m[j], alpha, x_);
              m[j].w = qj;
@@ -405,7 +398,7 @@ __kernel void k_lj_tip4p_long(const __global numtyp4 *restrict x_,
                vdj.x = xjO.x*cO + xjH1.x*cH + xjH2.x*cH;
                vdj.y = xjO.y*cO + xjH1.y*cH + xjH2.y*cH;
                vdj.z = xjO.z*cO + xjH1.z*cH + xjH2.z*cH;
                vdj.w = vdj.w;
                //vdj.w = vdj.w;
                virial[0] += (ix.x - vdj.x)*fd.x;
                virial[1] += (ix.y - vdj.y)*fd.y;
                virial[2] += (ix.z - vdj.z)*fd.z;
@@ -422,7 +415,7 @@ __kernel void k_lj_tip4p_long(const __global numtyp4 *restrict x_,
              vdi.x = xO.x*cO + xH1.x*cH + xH2.x*cH;
              vdi.y = xO.y*cO + xH1.y*cH + xH2.y*cH;
              vdi.z = xO.z*cO + xH1.z*cH + xH2.z*cH;
              vdi.w = vdi.w;
              //vdi.w = vdi.w;
              if (jtype != typeH){
                numtyp4 xjH1; fetch4(xjH1,jH1,pos_tex);
                numtyp4 xjH2; fetch4(xjH2,jH2,pos_tex);
@@ -430,7 +423,7 @@ __kernel void k_lj_tip4p_long(const __global numtyp4 *restrict x_,
                vdj.x = xjO.x*cO + xjH1.x*cH + xjH2.x*cH;
                vdj.y = xjO.y*cO + xjH1.y*cH + xjH2.y*cH;
                vdj.z = xjO.z*cO + xjH1.z*cH + xjH2.z*cH;
                vdj.w = vdj.w;
                //vdj.w = vdj.w;
              } else vdj = jx;
              vO[0] += 0.5*(vdi.x - vdj.x)*fd.x;
              vO[1] += 0.5*(vdi.y - vdj.y)*fd.y;
+23 −4
Original line number Diff line number Diff line
@@ -62,10 +62,29 @@ public:
  /// Total host memory used by library for pair style
  double host_memory_usage() const;

  /// Copy data which is easier to compute in LAMMPS_NS
  void copy_relations_data(int **hn, double **m, int n,int* tag, int *map_array, int map_size,
  /// Copy data from LAMMPS_NS
  void copy_relations_data(int n,int* tag, int *map_array, int map_size,
      int *sametag, int max_same, int ago);

  /// Reimplement BaseCharge pair loop with host neighboring
  void compute(const int f_ago, const int inum_full, const int nall,
               double **host_x, int *host_type, int *ilist, int *numj,
               int **firstneigh, const bool eflag, const bool vflag,
               const bool eatom, const bool vatom, int &host_start,
               const double cpu_time, bool &success, double *charge,
               const int nlocal, double *boxlo, double *prd);

  /// Reimplement BaseCharge pair loop with device neighboring
  int** compute(const int ago, const int inum_full, const int nall,
                double **host_x, int *host_type, double *sublo,
                double *subhi, tagint *tag,int *map_array, int map_size, int *sametag, int max_same,
                int **nspecial,
                tagint **special, const bool eflag, const bool vflag,
                const bool eatom, const bool vatom, int &host_start,
                int **ilist, int **numj, const double cpu_time, bool &success,
                double *charge, double *boxlo, double *prd);


  // --------------------------- TYPE DATA --------------------------

  /// lj1.x = lj1, lj1.y = lj2, lj1.z = cutsq_vdw
@@ -77,7 +96,6 @@ public:
  /// Special LJ values [0-3] and Special Coul values [4-7]
  UCL_D_Vec<numtyp> sp_lj;

  /// If atom type constants fit in shared memory, use fast kernels
  bool shared_types;

  /// Number of atom types
@@ -98,10 +116,11 @@ public:
  UCL_D_Vec<int> map_array;
  UCL_D_Vec<int> atom_sametag;

  UCL_Kernel k_pair_distrib;
  UCL_Kernel k_pair_distrib, k_pair_reneigh;

 private:
  bool _allocated;
  int t_ago;
  void loop(const bool _eflag, const bool _vflag);
};

+8 −4
Original line number Diff line number Diff line
@@ -112,14 +112,18 @@ void ljtip4p_long_gpu_clear() {

int ** ljtip4p_long_gpu_compute_n(const int ago, const int inum_full,
                        const int nall, double **host_x, int *host_type,
                        double *sublo, double *subhi, tagint *tag, int **nspecial,
                        double *sublo, double *subhi,
                        tagint *tag, int *map_array, int map_size,
                        int *sametag, int max_same,
                        int **nspecial,
                        tagint **special, const bool eflag, const bool vflag,
                        const bool eatom, const bool vatom, int &host_start,
                        int **ilist, int **jnum, const double cpu_time,
                        bool &success, double *host_q, double *boxlo,
                        double *prd) {
  return LJTIP4PLMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
                       subhi, tag, nspecial, special, eflag, vflag, eatom,
                       subhi, tag, map_array, map_size, sametag, max_same,
                       nspecial, special, eflag, vflag, eatom,
                       vatom, host_start, ilist, jnum, cpu_time, success,
                       host_q,boxlo, prd);
}
@@ -139,10 +143,10 @@ double ljtip4p_long_gpu_bytes() {
  return LJTIP4PLMF.host_memory_usage();
}

void ljtip4p_long_copy_molecule_data(int **hn, double **m, int n, int* tag,
void ljtip4p_long_copy_molecule_data(int n, int* tag,
    int *map_array, int map_size,
    int *sametag, int max_same, int ago){
  LJTIP4PLMF.copy_relations_data(hn, m, n, tag,map_array,map_size,sametag, max_same, ago);
  LJTIP4PLMF.copy_relations_data(n, tag, map_array, map_size, sametag, max_same, ago);
}

+12 −7
Original line number Diff line number Diff line
@@ -64,7 +64,10 @@ int ljtip4p_long_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
void ljtip4p_long_gpu_clear();
int ** ljtip4p_long_gpu_compute_n(const int ago, const int inum,
    const int nall, double **host_x, int *host_type,
    double *sublo, double *subhi, tagint *tag, int **nspecial,
    double *sublo, double *subhi,
    tagint *tag, int *map_array, int map_size,
    int *sametag, int max_same,
    int **nspecial,
    tagint **special, const bool eflag, const bool vflag,
    const bool eatom, const bool vatom, int &host_start,
    int **ilist, int **jnum,
@@ -78,7 +81,7 @@ void ljtip4p_long_gpu_compute(const int ago, const int inum, const int nall,
    bool &success, double *host_q, const int nlocal,
    double *boxlo, double *prd);
double ljtip4p_long_gpu_bytes();
void ljtip4p_long_copy_molecule_data(int **, double **, int, int* , int *,
void ljtip4p_long_copy_molecule_data(int, int* , int *,
		int, int *, int , int);

/* ---------------------------------------------------------------------- */
@@ -112,17 +115,16 @@ void PairLJCutTIP4PLongGPU::compute(int eflag, int vflag)
  int nall = atom->nlocal + atom->nghost;
  int inum, host_start;

  ljtip4p_long_copy_molecule_data(hneigh, newsite, nall, atom->tag,
      atom->get_map_array(), atom->get_map_size(),
      atom->sametag, atom->get_max_same(), neighbor->ago);

  bool success = true;
  int *ilist, *numneigh, **firstneigh;
  if (gpu_mode != GPU_FORCE) {
    inum = atom->nlocal;
    firstneigh = ljtip4p_long_gpu_compute_n(neighbor->ago, inum, nall,
        atom->x, atom->type, domain->sublo,
        domain->subhi, atom->tag, atom->nspecial,
        domain->subhi,
        atom->tag, atom->get_map_array(), atom->get_map_size(),
        atom->sametag, atom->get_max_same(),
        atom->nspecial,
        atom->special, eflag, vflag, eflag_atom,
        vflag_atom, host_start, &ilist, &numneigh,
        cpu_time, success, atom->q, domain->boxlo,
@@ -132,6 +134,9 @@ void PairLJCutTIP4PLongGPU::compute(int eflag, int vflag)
    ilist = list->ilist;
    numneigh = list->numneigh;
    firstneigh = list->firstneigh;
    ljtip4p_long_copy_molecule_data(nall, atom->tag,
        atom->get_map_array(), atom->get_map_size(),
        atom->sametag, atom->get_max_same(), neighbor->ago);
    ljtip4p_long_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type,
        ilist, numneigh, firstneigh, eflag, vflag, eflag_atom,
        vflag_atom, host_start, cpu_time, success, atom->q,