Commit 4a8146ad authored by Axel Kohlmeyer's avatar Axel Kohlmeyer
Browse files

import patch posted by christopher knight to lammps-users on august 21st

parent 39df9f5d
Loading
Loading
Loading
Loading
+69 −34
Original line number Diff line number Diff line
@@ -493,48 +493,78 @@ struct remap_plan_3d *remap_3d_create_plan(
      }
    }

    // Note:
    //  if we could use a hint for cases where we know all processors will be
    //  included in commringlist, then we can just duplicate the comm communicator
    //  and not execute the following O(Nproc^2) commringlist logic.

    // collide all inarray extents for the comm ring with all output
    // extents and all outarray extents for the comm ring with all input
    // extents - if there is a collison add the rank to the comm ring,
    // keep iterating until nothing is added to commring

    int istart = 0;
    int iend   = commringlen;

    int * list_found = (int *) malloc(nprocs*sizeof(int));
    for (int i=0; i<nprocs; ++i) list_found[i] = 0;
    for (int i=0; i<commringlen; ++i) list_found[commringlist[i]] = 1;

    int num_remaining_rnks = nprocs;
    int * remaining_rnks = (int *) malloc(nprocs*sizeof(int));
    for (int i=0; i<nprocs; ++i) remaining_rnks[i] = i;

    int commringappend = 1;
    while (commringappend) {
      int newcommringlen = commringlen;
      commringappend = 0;
      for (int i=0;i<commringlen;i++) {
        for (int j=0;j<nprocs;j++) {
          if (remap_3d_collide(&inarray[commringlist[i]],
                               &outarray[j],&overlap)) {
            int alreadyinlist = 0;
            for (int k=0;k<newcommringlen;k++) {
              if (commringlist[k] == j) {
                alreadyinlist = 1;
              }

      // shrink list of remaining ranks to test

      int num_remaining_rnks_new = 0;
      for (int k=0; k<num_remaining_rnks; ++k) {
        const int rnk = remaining_rnks[k];
        if(!list_found[rnk]) remaining_rnks[num_remaining_rnks_new++] = rnk;
      }
            if (!alreadyinlist) {

      for (int i=istart; i<iend; i++)
        for (int jj=0; jj<num_remaining_rnks_new; ++jj) {
          const int j = remaining_rnks[jj];

          if (remap_3d_collide(&inarray[commringlist[i]], &outarray[j], &overlap) ||
              remap_3d_collide(&outarray[commringlist[i]], &inarray[j], &overlap) ) {
            commringlist[newcommringlen++] = j;
              commringappend = 1;
            if (newcommringlen == nprocs) goto endloop;
          }

        }
          if (remap_3d_collide(&outarray[commringlist[i]],
                               &inarray[j],&overlap)) {
            int alreadyinlist = 0;
            for (int k=0;k<newcommringlen;k++) {
              if (commringlist[k] == j) alreadyinlist = 1;
            }
            if (!alreadyinlist) {
              commringlist[newcommringlen++] = j;

      // update list of ranks found

      if (newcommringlen > commringlen) {
        commringappend = 1;
        for (int i=commringlen; i<newcommringlen; ++i) list_found[commringlist[i]] = 1;
      }
          }
        }
      }

    endloop:

      // reset indices to only test newest ranks added

      istart = commringlen;
      iend   = newcommringlen;
      commringlen = newcommringlen;
      if (commringlen == nprocs) commringappend = 0;
    }

    free(list_found);
    free(remaining_rnks);

    // sort the final commringlist

    if (commringlen == nprocs) {
      for (int i=0; i<commringlen; ++i) commringlist[i] = i;
    } else {

      for (int c = 0 ; c < ( commringlen - 1 ); c++) {
        for (int d = 0 ; d < commringlen - c - 1; d++) {
          if (commringlist[d] > commringlist[d+1]) {
@@ -544,6 +574,7 @@ struct remap_plan_3d *remap_3d_create_plan(
          }
        }
      }
    }

    // resize commringlist to final size

@@ -684,14 +715,18 @@ int remap_3d_collide(struct extent_3d *block1, struct extent_3d *block2,
{
  overlap->ilo = MAX(block1->ilo,block2->ilo);
  overlap->ihi = MIN(block1->ihi,block2->ihi);

  if (overlap->ilo > overlap->ihi) return 0;

  overlap->jlo = MAX(block1->jlo,block2->jlo);
  overlap->jhi = MIN(block1->jhi,block2->jhi);

  if (overlap->jlo > overlap->jhi) return 0;

  overlap->klo = MAX(block1->klo,block2->klo);
  overlap->khi = MIN(block1->khi,block2->khi);

  if (overlap->ilo > overlap->ihi ||
      overlap->jlo > overlap->jhi ||
      overlap->klo > overlap->khi) return 0;
  if (overlap->klo > overlap->khi) return 0;

  overlap->isize = overlap->ihi - overlap->ilo + 1;
  overlap->jsize = overlap->jhi - overlap->jlo + 1;
+146 −4
Original line number Diff line number Diff line
@@ -46,6 +46,8 @@ enum{ONELEVEL,TWOLEVEL,NUMA,CUSTOM};
enum{CART,CARTREORDER,XYZ};
enum{LAYOUT_UNIFORM,LAYOUT_NONUNIFORM,LAYOUT_TILED};    // several files

#define MAX_RING_NEIGHBORS 27

/* ---------------------------------------------------------------------- */

Comm::Comm(LAMMPS *lmp) : Pointers(lmp)
@@ -73,6 +75,9 @@ Comm::Comm(LAMMPS *lmp) : Pointers(lmp)
  xsplit = ysplit = zsplit = NULL;
  rcbnew = 0;

  neighborflag = 0;
  ring_neighbors = NULL;

  // use of OpenMP threads
  // query OpenMP for number of threads/process set by user at run-time
  // if the OMP_NUM_THREADS environment variable is not set, we default
@@ -119,6 +124,8 @@ Comm::~Comm()
  memory->destroy(cutusermulti);
  delete [] customfile;
  delete [] outfile;

  if (ring_neighbors) memory->destroy(ring_neighbors);
}

/* ----------------------------------------------------------------------
@@ -302,6 +309,9 @@ void Comm::modify_params(int narg, char **arg)
      else if (strcmp(arg[iarg+1],"no") == 0) ghost_velocity = 0;
      else error->all(FLERR,"Illegal comm_modify command");
      iarg += 2;
    } else if (strcmp(arg[iarg],"ring_neighbor") == 0) {
      neighborflag = 1;
      iarg++;
    } else error->all(FLERR,"Illegal comm_modify command");
  }
}
@@ -585,6 +595,63 @@ void Comm::set_proc_grid(int outflag)
               universe->uworld);
    }
  }

  // if not uniform layout, then de-activate until support added

  if (neighborflag && layout != LAYOUT_UNIFORM) neighborflag = 0;

  // if using neighbor comm pattern instead of ring() over procs

  if (neighborflag) {

    // maximum number of potential neighbors

    memory->create(ring_neighbors, MAX_RING_NEIGHBORS, "comm:ring_neighbors");

    // Identify MPI ranks of neighboring 26 sub-domains

    int indx = 0;
    for (int i=-1; i<2; ++i) {
      int x = myloc[0] + i;
      if (x == -1         ) x = procgrid[0] - 1;
      if (x == procgrid[0]) x = 0;

      for (int j=-1; j<2; ++j) {
        int y = myloc[1] + j;
        if (y == -1         ) y = procgrid[1] - 1;
        if (y == procgrid[1]) y = 0;

        for (int k=-1; k<2; ++k) {
          int z = myloc[2] + k;
          if (z == -1         ) z = procgrid[2] - 1;
          if (z == procgrid[2]) z = 0;

          ring_neighbors[indx++] = grid2proc[x][y][z];
        }

      }

    }

    // due to pbc, there may be duplicates; find unique rank ids and eliminate self

    indx = 0;
    for (int i=0; i<MAX_RING_NEIGHBORS; ++i) {
      int new_entry = 1;
      if (ring_neighbors[i] == me) new_entry = 0;
      for (int j=0; j<indx; ++j)
        if (ring_neighbors[j] == ring_neighbors[i]) new_entry = 0;
      if (new_entry) {
        ring_neighbors[indx] = ring_neighbors[i];
        indx++;
      }
    }
    num_ring_neighbors = indx;

    // current implementation only makes sense if 26 unique neighbors

    if (num_ring_neighbors != MAX_RING_NEIGHBORS-1) neighborflag = 0;
  }
}

/* ----------------------------------------------------------------------
@@ -685,6 +752,14 @@ void Comm::ring(int n, int nper, void *inbuf, int messtag,
                void (*callback)(int, char *, void *),
                void *outbuf, void *ptr, int self)
{
  // intercept all ring() calls if only communicating with neighbors
  // -- good chance this isn't appropriate for 100% use cases...

  if (neighborflag) {
    ring_neighbor(n, nper, inbuf, messtag, callback, outbuf, ptr, self);
    return;
  }

  MPI_Request request;
  MPI_Status status;

@@ -723,6 +798,73 @@ void Comm::ring(int n, int nper, void *inbuf, int messtag,
  memory->destroy(bufcopy);
}

/* ----------------------------------------------------------------------
   communicate inbuf only to closest neighboring processors with messtag
   nbytes = size of inbuf = n datums * nper bytes
   callback() is invoked to allow caller to process/update each proc's inbuf
   if self=1 (default), then callback() is invoked on final iteration
     using original inbuf, which may have been updated
   for non-NULL outbuf, final updated inbuf is copied to it
     ok to specify outbuf = inbuf
   the ptr argument is a pointer to the instance of calling class
------------------------------------------------------------------------- */

void Comm::ring_neighbor(int n, int nper, void *inbuf, int messtag,
                         void (*callback)(int, char *, void *),
                         void *outbuf, void *ptr, int self)
{
  MPI_Request request;
  MPI_Status status;

  int nbytes = n*nper;
  int maxbytes;
  MPI_Allreduce(&nbytes,&maxbytes,1,MPI_INT,MPI_MAX,world);

  // no need to communicate without data

  if (maxbytes == 0) return;

  char *buf,*bufcopy;
  memory->create(buf,    maxbytes,"comm:buf");
  memory->create(bufcopy,maxbytes,"comm:bufcopy");
  memcpy(buf, inbuf, nbytes);

  // loop over nearest neighbors ping-ponging buf
  // -- if smarter, could have buf go around the ring to reduce MPI 2x (worth effort?)

  int irecv = num_ring_neighbors - 1;
  for (int isend=0; isend<num_ring_neighbors; ++isend) {

    // forward pass buf

    MPI_Irecv(bufcopy, maxbytes, MPI_CHAR, ring_neighbors[irecv], messtag, world, &request);
    MPI_Send(buf,      nbytes,   MPI_CHAR, ring_neighbors[isend], messtag, world);
    MPI_Wait(&request, &status);
    MPI_Get_count(&status, MPI_CHAR, &nbytes);

    // process buf

    callback(nbytes/nper, bufcopy, ptr);

    // reverse pass buf

    MPI_Irecv(buf,    maxbytes, MPI_CHAR, ring_neighbors[isend], messtag, world, &request);
    MPI_Send(bufcopy, nbytes,   MPI_CHAR, ring_neighbors[irecv], messtag, world);
    MPI_Wait(&request, &status);
    MPI_Get_count(&status, MPI_CHAR, &nbytes);

    irecv--;
  }

  // self process

  if (self) callback(nbytes/nper, buf, ptr);
  if (outbuf) memcpy(outbuf,buf,nbytes);

  memory->destroy(buf);
  memory->destroy(bufcopy);
}

/* ----------------------------------------------------------------------
   proc 0 reads Nlines from file into buf and bcasts buf to all procs
   caller allocates buf to max size needed
+7 −0
Original line number Diff line number Diff line
@@ -110,6 +110,9 @@ class Comm : protected Pointers {
  int read_lines_from_file(FILE *, int, int, char *);
  int read_lines_from_file_universe(FILE *, int, int, char *);

  void ring_neighbor(int, int, void *, int, void (*)(int, char *, void *),
                     void *, void *, int self = 1);

 protected:
  int bordergroup;           // only communicate this group in borders

@@ -137,6 +140,10 @@ class Comm : protected Pointers {
  int ncores;                       // # of cores per node
  int coregrid[3];                  // 3d grid of cores within a node
  int user_coregrid[3];             // user request for cores in each dim

  int neighborflag;                 // 1 if use neighbor only ring communication
  int num_ring_neighbors;           // number of unique neighbors
  int * ring_neighbors;             // list of neighboring ranks
};

}
+360 −73
Original line number Diff line number Diff line
@@ -44,7 +44,7 @@ void Replicate::command(int narg, char **arg)

  if (domain->box_exist == 0)
    error->all(FLERR,"Replicate command before simulation box is defined");
  if (narg != 3) error->all(FLERR,"Illegal replicate command");
  if (narg < 3 || narg > 4) error->all(FLERR,"Illegal replicate command");

  int me = comm->me;
  int nprocs = comm->nprocs;
@@ -58,6 +58,10 @@ void Replicate::command(int narg, char **arg)
  int nz = force->inumeric(FLERR,arg[2]);
  int nrep = nx*ny*nz;

  int use_more_memory = 0;
  if(narg == 4)
    if(strcmp(arg[3],"memory") == 0) use_more_memory = 1;

  // error and warning checks

  if (nx <= 0 || ny <= 0 || nz <= 0)
@@ -99,6 +103,37 @@ void Replicate::command(int narg, char **arg)
    maxmol = maxmol_all;
  }

  // check image flags maximum extent; only efficient small image flags compared to new system

  int _imagelo[3], _imagehi[3];
  _imagelo[0] = 0;
  _imagelo[1] = 0;
  _imagelo[2] = 0;
  _imagehi[0] = 0;
  _imagehi[1] = 0;
  _imagehi[2] = 0;

  if(use_more_memory) {

    for (i=0; i<atom->nlocal; ++i) {
      imageint image = atom->image[i];
      int xbox = (image & IMGMASK) - IMGMAX;
      int ybox = (image >> IMGBITS & IMGMASK) - IMGMAX;
      int zbox = (image >> IMG2BITS) - IMGMAX;

      if(xbox < _imagelo[0]) _imagelo[0] = xbox;
      if(ybox < _imagelo[1]) _imagelo[1] = ybox;
      if(zbox < _imagelo[2]) _imagelo[2] = zbox;

      if(xbox > _imagehi[0]) _imagehi[0] = xbox;
      if(ybox > _imagehi[1]) _imagehi[1] = ybox;
      if(zbox > _imagehi[2]) _imagehi[2] = zbox;
    }

    MPI_Allreduce(MPI_IN_PLACE, &(_imagelo[0]), 3, MPI_INT, MPI_MIN, world);
    MPI_Allreduce(MPI_IN_PLACE, &(_imagehi[0]), 3, MPI_INT, MPI_MAX, world);
  }

  // unmap existing atoms via image flags

  for (i = 0; i < atom->nlocal; i++)
@@ -280,6 +315,256 @@ void Replicate::command(int narg, char **arg)
  double *coord;
  int tag_enable = atom->tag_enable;

  if(use_more_memory) {

    // allgather size of buf on each proc

    n = 0;
    for (i = 0; i < old->nlocal; i++) n += old_avec->pack_restart(i,&buf[n]);

    int * size_buf_rnk;
    memory->create(size_buf_rnk, nprocs, "replicate:size_buf_rnk");

    MPI_Allgather(&n, 1, MPI_INT, size_buf_rnk, 1, MPI_INT, world);

    // size of buf_all

    int size_buf_all = 0;
    MPI_Allreduce(&n, &size_buf_all, 1, MPI_INT, MPI_SUM, world);

    if(me == 0 && screen) {
      fprintf(screen,"Replicate::bounding box image: lo= %i %i %i  hi= %i %i %i\n",
              _imagelo[0],_imagelo[1],_imagelo[2],_imagehi[0],_imagehi[1],_imagehi[2]);
      fprintf(screen,"Replicate:: buf_all memory allocating %10.2f MB\n",
              (double)size_buf_all*sizeof(double)/1024/1024);
    }

    // rnk offsets

    int * disp_buf_rnk;
    memory->create(disp_buf_rnk, nprocs, "replicate:disp_buf_rnk");
    disp_buf_rnk[0] = 0;
    for (int i=1; i<nprocs; ++i) disp_buf_rnk[i] = disp_buf_rnk[i-1] + size_buf_rnk[i-1];

    // allgather buf_all

    double * buf_all;
    memory->create(buf_all, size_buf_all, "replicate:buf_all");

    MPI_Allgatherv(buf, n, MPI_DOUBLE, buf_all, size_buf_rnk, disp_buf_rnk, MPI_DOUBLE, world);

    // bounding box of original unwrapped system

    double _orig_lo[3], _orig_hi[3];
    _orig_lo[0] = domain->boxlo[0] + _imagelo[0] * old_xprd;
    _orig_lo[1] = domain->boxlo[1] + _imagelo[1] * old_yprd;
    _orig_lo[2] = domain->boxlo[2] + _imagelo[2] * old_zprd;

    _orig_hi[0] = domain->boxlo[0] + (_imagehi[0]+1) * old_xprd;
    _orig_hi[1] = domain->boxlo[1] + (_imagehi[1]+1) * old_yprd;
    _orig_hi[2] = domain->boxlo[2] + (_imagehi[2]+1) * old_zprd;

    double _lo[3], _hi[3];

    int num_replicas_added = 0;

    for (ix = 0; ix < nx; ix++) {
      for (iy = 0; iy < ny; iy++) {
        for (iz = 0; iz < nz; iz++) {

          // domain->remap() overwrites coordinates, so always recompute here

          _lo[0] = _orig_lo[0] + ix * old_xprd;
          _hi[0] = _orig_hi[0] + ix * old_xprd;

          _lo[1] = _orig_lo[1] + iy * old_yprd;
          _hi[1] = _orig_hi[1] + iy * old_yprd;

          _lo[2] = _orig_lo[2] + iz * old_zprd;
          _hi[2] = _orig_hi[2] + iz * old_zprd;

          // test if bounding box of shifted replica overlaps sub-domain of proc
          // if not, then skip testing atoms

          int xoverlap = 1;
          int yoverlap = 1;
          int zoverlap = 1;
          if( _lo[0] > (subhi[0] - EPSILON) || _hi[0] < (sublo[0] + EPSILON) ) xoverlap = 0;
          if( _lo[1] > (subhi[1] - EPSILON) || _hi[1] < (sublo[1] + EPSILON) ) yoverlap = 0;
          if( _lo[2] > (subhi[2] - EPSILON) || _hi[2] < (sublo[2] + EPSILON) ) zoverlap = 0;

          int overlap = 0;
          if(xoverlap && yoverlap && zoverlap) overlap = 1;

          // if no overlap, test if bounding box wrapped back into new system

          if(!overlap) {

            // wrap back into cell

            imageint imagelo = ((imageint) IMGMAX << IMG2BITS) |
              ((imageint) IMGMAX << IMGBITS) | IMGMAX;
            domain->remap(&(_lo[0]), imagelo);
            int xboxlo = (imagelo & IMGMASK) - IMGMAX;
            int yboxlo = (imagelo >> IMGBITS & IMGMASK) - IMGMAX;
            int zboxlo = (imagelo >> IMG2BITS) - IMGMAX;

            imageint imagehi = ((imageint) IMGMAX << IMG2BITS) |
              ((imageint) IMGMAX << IMGBITS) | IMGMAX;
            domain->remap(&(_hi[0]), imagehi);
            int xboxhi = (imagehi & IMGMASK) - IMGMAX;
            int yboxhi = (imagehi >> IMGBITS & IMGMASK) - IMGMAX;
            int zboxhi = (imagehi >> IMG2BITS) - IMGMAX;

            // test all fragments for any overlap; ok to include false positives

            int _xoverlap1 = 0;
            int _xoverlap2 = 0;
            if(!xoverlap) {
              if(xboxlo < 0) {
                _xoverlap1 = 1;
                if( _lo[0] > (subhi[0] - EPSILON) ) _xoverlap1 = 0;
              }

              if(xboxhi > 0) {
                _xoverlap2 = 1;
                if( _hi[0] < (sublo[0] + EPSILON) ) _xoverlap2 = 0;
              }

              if(_xoverlap1 || _xoverlap2) xoverlap = 1;
            }

            int _yoverlap1 = 0;
            int _yoverlap2 = 0;
            if(!yoverlap) {
              if(yboxlo < 0) {
                _yoverlap1 = 1;
                if( _lo[1] > (subhi[1] - EPSILON) ) _yoverlap1 = 0;
              }

              if(yboxhi > 0) {
                _yoverlap2 = 1;
                if( _hi[1] < (sublo[1] + EPSILON) ) _yoverlap2 = 0;
              }

              if(_yoverlap1 || _yoverlap2) yoverlap = 1;
            }


            int _zoverlap1 = 0;
            int _zoverlap2 = 0;
            if(!zoverlap) {
              if(zboxlo < 0) {
                _zoverlap1 = 1;
                if( _lo[2] > (subhi[2] - EPSILON) ) _zoverlap1 = 0;
              }

              if(zboxhi > 0) {
                _zoverlap2 = 1;
                if( _hi[2] < (sublo[2] + EPSILON) ) _zoverlap2 = 0;
              }

              if(_zoverlap1 || _zoverlap2) zoverlap = 1;
            }

            // does either fragment overlap w/ sub-domain

            if(xoverlap && yoverlap && zoverlap) overlap = 1;
          }

          // while loop over one proc's atom list

          if(overlap) {
            num_replicas_added++;

            m = 0;
            while (m < size_buf_all) {
              image = ((imageint) IMGMAX << IMG2BITS) |
                ((imageint) IMGMAX << IMGBITS) | IMGMAX;
              if (triclinic == 0) {
                x[0] = buf_all[m+1] + ix*old_xprd;
                x[1] = buf_all[m+2] + iy*old_yprd;
                x[2] = buf_all[m+3] + iz*old_zprd;
              } else {
                x[0] = buf_all[m+1] + ix*old_xprd + iy*old_xy + iz*old_xz;
                x[1] = buf_all[m+2] + iy*old_yprd + iz*old_yz;
                x[2] = buf_all[m+3] + iz*old_zprd;
              }
              domain->remap(x,image);
              if (triclinic) {
                domain->x2lamda(x,lamda);
                coord = lamda;
              } else coord = x;

              if (coord[0] >= sublo[0] && coord[0] < subhi[0] &&
                  coord[1] >= sublo[1] && coord[1] < subhi[1] &&
                  coord[2] >= sublo[2] && coord[2] < subhi[2]) {

                m += avec->unpack_restart(&buf_all[m]);

                i = atom->nlocal - 1;
                if (tag_enable)
                  atom_offset = iz*ny*nx*maxtag + iy*nx*maxtag + ix*maxtag;
                else atom_offset = 0;
                mol_offset = iz*ny*nx*maxmol + iy*nx*maxmol + ix*maxmol;

                atom->x[i][0] = x[0];
                atom->x[i][1] = x[1];
                atom->x[i][2] = x[2];

                atom->tag[i] += atom_offset;
                atom->image[i] = image;

                if (atom->molecular) {
                  if (atom->molecule[i] > 0)
                    atom->molecule[i] += mol_offset;
                  if (atom->molecular == 1) {
                    if (atom->avec->bonds_allow)
                      for (j = 0; j < atom->num_bond[i]; j++)
                        atom->bond_atom[i][j] += atom_offset;
                    if (atom->avec->angles_allow)
                      for (j = 0; j < atom->num_angle[i]; j++) {
                        atom->angle_atom1[i][j] += atom_offset;
                        atom->angle_atom2[i][j] += atom_offset;
                        atom->angle_atom3[i][j] += atom_offset;
                      }
                    if (atom->avec->dihedrals_allow)
                      for (j = 0; j < atom->num_dihedral[i]; j++) {
                        atom->dihedral_atom1[i][j] += atom_offset;
                        atom->dihedral_atom2[i][j] += atom_offset;
                        atom->dihedral_atom3[i][j] += atom_offset;
                        atom->dihedral_atom4[i][j] += atom_offset;
                      }
                    if (atom->avec->impropers_allow)
                      for (j = 0; j < atom->num_improper[i]; j++) {
                        atom->improper_atom1[i][j] += atom_offset;
                        atom->improper_atom2[i][j] += atom_offset;
                        atom->improper_atom3[i][j] += atom_offset;
                        atom->improper_atom4[i][j] += atom_offset;
                      }
                  }
                }
              } else m += static_cast<int> (buf_all[m]);
            }
          } // if(overlap)

        }
      }
    }

    memory->destroy(size_buf_rnk);
    memory->destroy(disp_buf_rnk);
    memory->destroy(buf_all);

    int sum = 0;
    MPI_Reduce(&num_replicas_added, &sum, 1, MPI_INT, MPI_SUM, 0, world);
    double avg = (double) sum / nprocs;
    if (me == 0 && screen)
      fprintf(screen,"Replicate: average # of replicas added to proc= %f out of %i (%f %%)\n",
              avg,nx*ny*nz,avg/(nx*ny*nz)*100.0);

  } else {

    for (int iproc = 0; iproc < nprocs; iproc++) {
      if (me == iproc) {
        n = 0;
@@ -368,6 +653,8 @@ void Replicate::command(int narg, char **arg)
      }
    }

  } // if(use_more_memory)

  // free communication buffer and old atom class

  memory->destroy(buf);