Commit 9de02621 authored by Steve Plimpton's avatar Steve Plimpton
Browse files

added rendezvous via all2all

parent fd813085
Loading
Loading
Loading
Loading
+3 −2
Original line number Diff line number Diff line
@@ -1576,8 +1576,9 @@ void FixRigidSmall::create_bodies(tagint *bodyID)
  // func = compute bbox of each body, find atom closest to geometric center

  char *buf;
  int nreturn = comm->rendezvous(ncount,proclist,(char *) inbuf,sizeof(InRvous),
                                 rendezvous_body,buf,sizeof(OutRvous),
  int nreturn = comm->rendezvous(1,ncount,(char *) inbuf,sizeof(InRvous),
                                 0,proclist,
                                 rendezvous_body,0,buf,sizeof(OutRvous),
                                 (void *) this);
  OutRvous *outbuf = (OutRvous *) buf;
  
+16 −14
Original line number Diff line number Diff line
@@ -1068,9 +1068,9 @@ void FixShake::atom_owners()
  // each proc assigned every 1/Pth atom
  
  char *buf;
  comm->rendezvous(nlocal,proclist, 
                   (char *) idbuf,sizeof(IDRvous),
                   rendezvous_ids,buf,0,(void *) this);
  comm->rendezvous(1,nlocal,(char *) idbuf,sizeof(IDRvous),
                   0,proclist,
                   rendezvous_ids,0,buf,0,(void *) this);

  memory->destroy(proclist);
  memory->sfree(idbuf);
@@ -1174,9 +1174,10 @@ void FixShake::partner_info(int *npartner, tagint **partner_tag,
  // receives all data needed to populate un-owned partner 4 values

  char *buf;
  int nreturn = comm->rendezvous(nsend,proclist, 
                                 (char *) inbuf,sizeof(PartnerInfo),
                                 rendezvous_partners_info,buf,sizeof(PartnerInfo),
  int nreturn = comm->rendezvous(1,nsend,(char *) inbuf,sizeof(PartnerInfo),
                                 0,proclist,
                                 rendezvous_partners_info,
                                 0,buf,sizeof(PartnerInfo),
                                 (void *) this);
  PartnerInfo *outbuf = (PartnerInfo *) buf;
    
@@ -1194,7 +1195,8 @@ void FixShake::partner_info(int *npartner, tagint **partner_tag,
    partner_type[i][j] = outbuf[m].type;
    partner_massflag[i][j] = outbuf[m].massflag;

    // only set partner_bondtype if my atom did not set it when setting up rendezvous
    // only set partner_bondtype if my atom did not set it
    //   when setting up rendezvous
    // if this proc set it, then sender of this datum set outbuf.bondtype = 0
    
    if (partner_bondtype[i][j] == 0)
@@ -1261,9 +1263,9 @@ void FixShake::nshake_info(int *npartner, tagint **partner_tag,
  // receives all data needed to populate un-owned partner nshake

  char *buf;
  int nreturn = comm->rendezvous(nsend,proclist, 
                                 (char *) inbuf,sizeof(NShakeInfo),
                                 rendezvous_nshake,buf,sizeof(NShakeInfo),
  int nreturn = comm->rendezvous(1,nsend,(char *) inbuf,sizeof(NShakeInfo),
                                 0,proclist,
                                 rendezvous_nshake,0,buf,sizeof(NShakeInfo),
                                 (void *) this);
  NShakeInfo *outbuf = (NShakeInfo *) buf;
    
@@ -1354,9 +1356,9 @@ void FixShake::shake_info(int *npartner, tagint **partner_tag,
  // receives all data needed to populate un-owned shake info

  char *buf;
  int nreturn = comm->rendezvous(nsend,proclist, 
                                 (char *) inbuf,sizeof(ShakeInfo),
                                 rendezvous_shake,buf,sizeof(ShakeInfo),
  int nreturn = comm->rendezvous(1,nsend,(char *) inbuf,sizeof(ShakeInfo),
                                 0,proclist,
                                 rendezvous_shake,0,buf,sizeof(ShakeInfo),
                                 (void *) this);
  ShakeInfo *outbuf = (ShakeInfo *) buf;
    
@@ -1412,7 +1414,7 @@ int FixShake::rendezvous_ids(int n, char *inbuf,
  fsptr->atomIDs = atomIDs;
  fsptr->procowner = procowner;

  // flag = 0: no 2nd irregular comm needed in comm->rendezvous
  // flag = 0: no second comm needed in rendezvous
  
  flag = 0;
  return 0;
+293 −25
Original line number Diff line number Diff line
@@ -729,34 +729,78 @@ void Comm::ring(int n, int nper, void *inbuf, int messtag,
/* ----------------------------------------------------------------------
   rendezvous communication operation
   three stages:
     first Irregular converts inbuf from caller decomp to rvous decomp
     first comm sends inbuf from caller decomp to rvous decomp
     callback operates on data in rendevous decomp
     last Irregular converts outbuf from rvous decomp back to caller decomp
     second comm sends outbuf from rvous decomp back to caller decomp
   inputs:
     n = # of input datums
     proclist = proc that owns each input datum in rendezvous decomposition
     inbuf = list of input datums
     insize = size in bytes of each input datum
     which = perform (0) irregular or (1) MPI_All2allv communication
     n = # of datums in inbuf
     inbuf = vector of input datums
     insize = byte size of each input datum
     inorder = 0 for inbuf in random proc order, 1 for datums ordered by proc
     procs: inorder 0 = proc to send each datum to, 1 = # of datums/proc, 
     callback = caller function to invoke in rendezvous decomposition
                takes input datums, returns output datums
     outorder = same as inorder, but for datums returned by callback()
     ptr = pointer to caller class, passed to callback()
   outputs:
     nout = # of output datums (function return)
     outbuf = list of output datums
     outsize = size in bytes of each output datum
     outbuf = vector of output datums
     outsize = byte size of each output datum
   callback inputs:
     nrvous = # of rvous decomp datums in inbuf_rvous
     inbuf_rvous = vector of rvous decomp input datums
     ptr = pointer to caller class
   callback outputs:
     nrvous_out = # of rvous decomp output datums (function return)
     flag = 0 for no second comm, 1 for outbuf_rvous = inbuf_rvous,
            2 for second comm with new outbuf_rvous
     procs_rvous = outorder 0 = proc to send each datum to, 1 = # of datums/proc
                   allocated
     outbuf_rvous = vector of rvous decomp output datums
   NOTE: could use MPI_INT or MPI_DOUBLE insead of MPI_CHAR
         to avoid checked-for overflow in MPI_Alltoallv?
------------------------------------------------------------------------- */

int Comm::rendezvous(int n, int *proclist, char *inbuf, int insize,
int Comm::
rendezvous(int which, int n, char *inbuf, int insize,
           int inorder, int *procs,
           int (*callback)(int, char *, int &, int *&, char *&, void *),
                     char *&outbuf, int outsize, void *ptr)
           int outorder, char *&outbuf, int outsize, void *ptr)
{
  // comm inbuf from caller decomposition to rendezvous decomposition
  int nout;

  if (which == 0)
    nout = rendezvous_irregular(n,inbuf,insize,inorder,procs,callback,
                                outorder,outbuf,outsize,ptr);
  else
    nout = rendezvous_all2all(n,inbuf,insize,inorder,procs,callback,
                              outorder,outbuf,outsize,ptr);

  return nout;
}

/* ---------------------------------------------------------------------- */

int Comm::
rendezvous_irregular(int n, char *inbuf, int insize, int inorder, int *procs,
                     int (*callback)(int, char *, int &, int *&, char *&, void *),
                     int outorder, char *&outbuf, 
                     int outsize, void *ptr)
{
  // irregular comm of inbuf from caller decomp to rendezvous decomp
  
  Irregular *irregular = new Irregular(lmp);

  int n_rvous = irregular->create_data(n,proclist);  // add sort
  char *inbuf_rvous = (char *) memory->smalloc((bigint) n_rvous*insize,
                                               "rendezvous:inbuf_rvous");
  int nrvous;
  if (inorder) nrvous = irregular->create_data_grouped(n,procs);
  else nrvous = irregular->create_data(n,procs);

  char *inbuf_rvous = (char *) memory->smalloc((bigint) nrvous*insize,
                                               "rendezvous:inbuf");
  irregular->exchange_data(inbuf,insize,inbuf_rvous);

  bigint irregular1_bytes = 0; //irregular->irregular_bytes;
  irregular->destroy_data();
  delete irregular;

@@ -764,29 +808,253 @@ int Comm::rendezvous(int n, int *proclist, char *inbuf, int insize,
  // callback() allocates/populates proclist_rvous and outbuf_rvous

  int flag;
  int *proclist_rvous;
  int *procs_rvous;
  char *outbuf_rvous;

  int nout_rvous = 
    callback(n_rvous,inbuf_rvous,flag,proclist_rvous,outbuf_rvous,ptr);
  int nrvous_out = callback(nrvous,inbuf_rvous,flag,
                            procs_rvous,outbuf_rvous,ptr);

  if (flag != 1) memory->sfree(inbuf_rvous);  // outbuf_rvous = inbuf_vous
  if (flag == 0) return 0;    // all nout_rvous are 0, no 2nd irregular
  if (flag == 0) return 0;    // all nout_rvous are 0, no 2nd comm stage

  // comm outbuf from rendezvous decomposition back to caller
  // irregular comm of outbuf from rendezvous decomp back to caller decomp
  // caller will free outbuf

  irregular = new Irregular(lmp);
  
  int nout = irregular->create_data(nout_rvous,proclist_rvous);
  outbuf = (char *) memory->smalloc((bigint) nout*outsize,"rendezvous:outbuf");
  int nout;
  if (outorder) 
    nout = irregular->create_data_grouped(nrvous_out,procs_rvous);
  else nout = irregular->create_data(nrvous_out,procs_rvous);

  outbuf = (char *) memory->smalloc((bigint) nout*outsize,
                                    "rendezvous:outbuf");
  irregular->exchange_data(outbuf_rvous,outsize,outbuf);

  bigint irregular2_bytes = 0; //irregular->irregular_bytes;
  irregular->destroy_data();
  delete irregular;
  memory->destroy(proclist_rvous);

  memory->destroy(procs_rvous);
  memory->sfree(outbuf_rvous);

  // approximate memory tally

  bigint rvous_bytes = 0;
  rvous_bytes += n*insize;                                // inbuf
  rvous_bytes += nout*outsize;                            // outbuf
  rvous_bytes += nrvous*insize;                           // inbuf_rvous
  rvous_bytes += nrvous_out*outsize;                      // outbuf_rvous
  rvous_bytes += nrvous_out*sizeof(int);                  // procs_rvous
  rvous_bytes += MAX(irregular1_bytes,irregular2_bytes);  // max of 2 comms

  // return number of output datums

  return nout;
}

/* ---------------------------------------------------------------------- */

int Comm::
rendezvous_all2all(int n, char *inbuf, int insize, int inorder, int *procs,
                   int (*callback)(int, char *, int &, int *&, char *&, void *),
                   int outorder, char *&outbuf, int outsize, void *ptr)
{
  int iproc;
  bigint all2all1_bytes,all2all2_bytes;
  int *sendcount,*sdispls,*recvcount,*rdispls;
  int *procs_a2a;
  bigint *offsets;
  char *inbuf_a2a,*outbuf_a2a;

  // create procs and inbuf for All2all if necesary
  
  if (!inorder) {
    memory->create(procs_a2a,nprocs,"rendezvous:procs");
    inbuf_a2a = (char *) memory->smalloc((bigint) n*insize,
                                         "rendezvous:inbuf");
    memory->create(offsets,nprocs,"rendezvous:offsets");

    for (int i = 0; i < nprocs; i++) procs_a2a[i] = 0;
    for (int i = 0; i < n; i++) procs_a2a[procs[i]]++;

    offsets[0] = 0;
    for (int i = 1; i < nprocs; i++)
      offsets[i] = offsets[i-1] + insize*procs_a2a[i-1];

    bigint offset = 0;
    for (int i = 0; i < n; i++) {
      iproc = procs[i];
      memcpy(&inbuf_a2a[offsets[iproc]],&inbuf[offset],insize);
      offsets[iproc] += insize;
      offset += insize;
    }

    all2all1_bytes = nprocs*sizeof(int) + nprocs*sizeof(bigint) + n*insize;

  } else {
    procs_a2a = procs;
    inbuf_a2a = inbuf;
    all2all1_bytes = 0;
  }

  // create args for MPI_Alltoallv() on input data
  
  memory->create(sendcount,nprocs,"rendezvous:sendcount");
  memcpy(sendcount,procs_a2a,nprocs*sizeof(int));
  
  memory->create(recvcount,nprocs,"rendezvous:recvcount");
  MPI_Alltoall(sendcount,1,MPI_INT,recvcount,1,MPI_INT,world);

  memory->create(sdispls,nprocs,"rendezvous:sdispls");
  memory->create(rdispls,nprocs,"rendezvous:rdispls");
  sdispls[0] = rdispls[0] = 0;
  for (int i = 1; i < nprocs; i++) {
    sdispls[i] = sdispls[i-1] + sendcount[i-1];
    rdispls[i] = rdispls[i-1] + recvcount[i-1];
  }
  int nrvous = rdispls[nprocs-1] + recvcount[nprocs-1];

  // test for overflow of input data due to imbalance or insize
  // means that individual sdispls or rdispls values overflow
  
  int overflow = 0;
  if ((bigint) n*insize > MAXSMALLINT) overflow = 1;
  if ((bigint) nrvous*insize > MAXSMALLINT) overflow = 1;
  int overflowall;
  MPI_Allreduce(&overflow,&overflowall,1,MPI_INT,MPI_MAX,world);
  if (overflowall) error->all(FLERR,"Overflow input size in rendezvous_a2a");
  
  for (int i = 0; i < nprocs; i++) {
    sendcount[i] *= insize;
    sdispls[i] *= insize;
    recvcount[i] *= insize;
    rdispls[i] *= insize;
  }

  // all2all comm of inbuf from caller decomp to rendezvous decomp

  char *inbuf_rvous = (char *) memory->smalloc((bigint) nrvous*insize,
                                               "rendezvous:inbuf");

  MPI_Alltoallv(inbuf_a2a,sendcount,sdispls,MPI_CHAR,
		inbuf_rvous,recvcount,rdispls,MPI_CHAR,world);

  if (!inorder) {
    memory->destroy(procs_a2a);
    memory->sfree(inbuf_a2a);
    memory->destroy(offsets);
  }

  // peform rendezvous computation via callback()
  // callback() allocates/populates proclist_rvous and outbuf_rvous

  int flag;
  int *procs_rvous;
  char *outbuf_rvous;

  int nrvous_out = callback(nrvous,inbuf_rvous,flag,
                            procs_rvous,outbuf_rvous,ptr);

  if (flag != 1) memory->sfree(inbuf_rvous);  // outbuf_rvous = inbuf_vous
  if (flag == 0) return 0;    // all nout_rvous are 0, no 2nd irregular

  // create procs and outbuf for All2all if necesary
  
  if (!outorder) {
    memory->create(procs_a2a,nprocs,"rendezvous_a2a:procs");

    outbuf_a2a = (char *) memory->smalloc((bigint) nrvous_out*outsize,
                                          "rendezvous:outbuf");
    memory->create(offsets,nprocs,"rendezvous:offsets");

    for (int i = 0; i < nprocs; i++) procs_a2a[i] = 0;
    for (int i = 0; i < nrvous_out; i++) procs_a2a[procs_rvous[i]]++;

    offsets[0] = 0;
    for (int i = 1; i < nprocs; i++)
      offsets[i] = offsets[i-1] + outsize*procs_a2a[i-1];

    bigint offset = 0;
    for (int i = 0; i < nrvous_out; i++) {
      iproc = procs_rvous[i];
      memcpy(&outbuf_a2a[offsets[iproc]],&outbuf_rvous[offset],outsize);
      offsets[iproc] += outsize;
      offset += outsize;
    }
    
    all2all2_bytes = nprocs*sizeof(int) + nprocs*sizeof(bigint) + 
      nrvous_out*outsize;

  } else {
    procs_a2a = procs_rvous;
    outbuf_a2a = outbuf_rvous;
    all2all2_bytes = 0;
  }

  // comm outbuf from rendezvous decomposition back to caller

  memcpy(sendcount,procs_a2a,nprocs*sizeof(int));
  
  MPI_Alltoall(sendcount,1,MPI_INT,recvcount,1,MPI_INT,world);
  
  sdispls[0] = rdispls[0] = 0;
  for (int i = 1; i < nprocs; i++) {
    sdispls[i] = sdispls[i-1] + sendcount[i-1];
    rdispls[i] = rdispls[i-1] + recvcount[i-1];
  }
  int nout = rdispls[nprocs-1] + recvcount[nprocs-1];

  // test for overflow of outbuf due to imbalance or outsize
  // means that individual sdispls or rdispls values overflow
  
  overflow = 0;
  if ((bigint) nrvous*outsize > MAXSMALLINT) overflow = 1;
  if ((bigint) nout*outsize > MAXSMALLINT) overflow = 1;
  MPI_Allreduce(&overflow,&overflowall,1,MPI_INT,MPI_MAX,world);
  if (overflowall) error->all(FLERR,"Overflow output in rendezvous_a2a");
  
  for (int i = 0; i < nprocs; i++) {
    sendcount[i] *= outsize;
    sdispls[i] *= outsize;
    recvcount[i] *= outsize;
    rdispls[i] *= outsize;
  }

  // all2all comm of outbuf from rendezvous decomp back to caller decomp
  // caller will free outbuf

  outbuf = (char *) memory->smalloc((bigint) nout*outsize,"rendezvous:outbuf");

  MPI_Alltoallv(outbuf_a2a,sendcount,sdispls,MPI_CHAR,
		outbuf,recvcount,rdispls,MPI_CHAR,world);

  memory->destroy(procs_rvous);
  memory->sfree(outbuf_rvous);

  if (!outorder) {
    memory->destroy(procs_a2a);
    memory->sfree(outbuf_a2a);
    memory->destroy(offsets);
  }

  // clean up

  memory->destroy(sendcount);
  memory->destroy(recvcount);
  memory->destroy(sdispls);
  memory->destroy(rdispls);

  // approximate memory tally

  bigint rvous_bytes = 0;
  rvous_bytes += n*insize;                                // inbuf
  rvous_bytes += nout*outsize;                            // outbuf
  rvous_bytes += nrvous*insize;                           // inbuf_rvous
  rvous_bytes += nrvous_out*outsize;                      // outbuf_rvous
  rvous_bytes += nrvous_out*sizeof(int);                  // procs_rvous
  rvous_bytes += 4*nprocs*sizeof(int);                    // all2all vectors
  rvous_bytes += MAX(all2all1_bytes,all2all2_bytes);      // reorder ops

  // return number of datums

  return nout;
+10 −2
Original line number Diff line number Diff line
@@ -109,9 +109,9 @@ class Comm : protected Pointers {

  void ring(int, int, void *, int, void (*)(int, char *, void *),
            void *, void *, int self = 1);
  int rendezvous(int, int *, char *, int, 
  int rendezvous(int, int, char *, int, int, int *, 
                 int (*)(int, char *, int &, int *&, char *&, void *), 
                 char *&, int, void *);
                 int, char *&, int, void *);

  int read_lines_from_file(FILE *, int, int, char *);
  int read_lines_from_file_universe(FILE *, int, int, char *);
@@ -146,6 +146,14 @@ class Comm : protected Pointers {
  int ncores;                       // # of cores per node
  int coregrid[3];                  // 3d grid of cores within a node
  int user_coregrid[3];             // user request for cores in each dim

  int rendezvous_irregular(int, char *, int, int, int *, 
                           int (*)(int, char *, int &, int *&, char *&, void *), 
                           int, char *&, int, void *);
  int rendezvous_all2all(int, char *, int, int, int *, 
                         int (*)(int, char *, int &, int *&, char *&, void *), 
                         int, char *&, int, void *);

 public:
  enum{MULTIPLE};
};
+183 −2
Original line number Diff line number Diff line
@@ -622,6 +622,7 @@ int Irregular::create_data(int n, int *proclist, int sortflag)
  num_send = new int[nsend_proc];
  index_send = new int[n-work1[me]];
  index_self = new int[work1[me]];
  maxindex = n;

  // proc_send = procs I send to
  // num_send = # of datums I send to each proc
@@ -679,8 +680,182 @@ int Irregular::create_data(int n, int *proclist, int sortflag)

  // receive incoming messages
  // proc_recv = procs I recv from
  // num_recv = total size of message each proc sends me
  // nrecvdatum = total size of data I recv
  // num_recv = # of datums each proc sends me
  // nrecvdatum = total # of datums I recv

  int nrecvdatum = 0;
  for (i = 0; i < nrecv_proc; i++) {
    MPI_Recv(&num_recv[i],1,MPI_INT,MPI_ANY_SOURCE,0,world,status);
    proc_recv[i] = status->MPI_SOURCE;
    nrecvdatum += num_recv[i];
  }
  nrecvdatum += num_self;

  // sort proc_recv and num_recv by proc ID if requested
  // useful for debugging to insure reproducible ordering of received datums

  if (sortflag) {
    int *order = new int[nrecv_proc];
    int *proc_recv_ordered = new int[nrecv_proc];
    int *num_recv_ordered = new int[nrecv_proc];

    for (i = 0; i < nrecv_proc; i++) order[i] = i;

#if defined(LMP_QSORT)
    proc_recv_copy = proc_recv;
    qsort(order,nrecv_proc,sizeof(int),compare_standalone);
#else
    merge_sort(order,nrecv_proc,(void *)proc_recv,compare_standalone);
#endif

    int j;
    for (i = 0; i < nrecv_proc; i++) {
      j = order[i];
      proc_recv_ordered[i] = proc_recv[j];
      num_recv_ordered[i] = num_recv[j];
    }

    memcpy(proc_recv,proc_recv_ordered,nrecv_proc*sizeof(int));
    memcpy(num_recv,num_recv_ordered,nrecv_proc*sizeof(int));
    delete [] order;
    delete [] proc_recv_ordered;
    delete [] num_recv_ordered;
  }

  // barrier to insure all MPI_ANY_SOURCE messages are received
  // else another proc could proceed to exchange_data() and send to me

  MPI_Barrier(world);

  // return # of datums I will receive

  return nrecvdatum;
}

/* ----------------------------------------------------------------------
   create communication plan based on list of datums of uniform size
   n = # of datums to send
   procs = how many datums to send to each proc, must include self
   sort = flag for sorting order of received messages by proc ID
   return total # of datums I will recv, including any to self
------------------------------------------------------------------------- */

int Irregular::create_data_grouped(int n, int *procs, int sortflag)
{
  int i,j,k,m;

  // setup for collective comm
  // work1 = # of datums I send to each proc, set self to 0
  // work2 = 1 for all procs, used for ReduceScatter

  for (i = 0; i < nprocs; i++) {
    work1[i] = procs[i];
    work2[i] = 1;
  }
  work1[me] = 0;

  // nrecv_proc = # of procs I receive messages from, not including self
  // options for performing ReduceScatter operation
  // some are more efficient on some machines at big sizes

#ifdef LAMMPS_RS_ALLREDUCE_INPLACE
  MPI_Allreduce(MPI_IN_PLACE,work1,nprocs,MPI_INT,MPI_SUM,world);
  nrecv_proc = work1[me];
#else 
#ifdef LAMMPS_RS_ALLREDUCE
  MPI_Allreduce(work1,work2,nprocs,MPI_INT,MPI_SUM,world);
  nrecv_proc = work2[me];
#else
  MPI_Reduce_scatter(work1,&nrecv_proc,work2,MPI_INT,MPI_SUM,world);
#endif
#endif

  // allocate receive arrays

  proc_recv = new int[nrecv_proc];
  num_recv = new int[nrecv_proc];
  request = new MPI_Request[nrecv_proc];
  status = new MPI_Status[nrecv_proc];

  // work1 = # of datums I send to each proc, including self
  // nsend_proc = # of procs I send messages to, not including self

  for (i = 0; i < nprocs; i++) work1[i] = procs[i];

  nsend_proc = 0;
  for (i = 0; i < nprocs; i++)
    if (work1[i]) nsend_proc++;
  if (work1[me]) nsend_proc--;

  // allocate send and self arrays

  proc_send = new int[nsend_proc];
  num_send = new int[nsend_proc];
  index_send = new int[n-work1[me]];
  index_self = new int[work1[me]];
  maxindex = n;

  // proc_send = procs I send to
  // num_send = # of datums I send to each proc
  // num_self = # of datums I copy to self
  // to balance pattern of send messages:
  //   each proc begins with iproc > me, continues until iproc = me
  // reset work1 to store which send message each proc corresponds to

  int iproc = me;
  int isend = 0;
  for (i = 0; i < nprocs; i++) {
    iproc++;
    if (iproc == nprocs) iproc = 0;
    if (iproc == me) {
      num_self = work1[iproc];
      work1[iproc] = 0;
    } else if (work1[iproc] > 0) {
      proc_send[isend] = iproc;
      num_send[isend] = work1[iproc];
      work1[iproc] = isend;
      isend++;
    }
  }

  // work2 = offsets into index_send for each proc I send to
  // m = ptr into index_self
  // index_send = list of which datums to send to each proc
  //   1st N1 values are datum indices for 1st proc,
  //   next N2 values are datum indices for 2nd proc, etc
  // index_self = list of which datums to copy to self

  work2[0] = 0;
  for (i = 1; i < nsend_proc; i++) work2[i] = work2[i-1] + num_send[i-1];

  m = 0;
  i = 0;
  for (iproc = 0; iproc < nprocs; iproc++) {
    k = procs[iproc];
    for (j = 0; j < k; j++) {
      if (iproc == me) index_self[m++] = i++;
      else {
        isend = work1[iproc];
        index_send[work2[isend]++] = i++;
      }
    }
  }

  // tell receivers how much data I send
  // sendmax_proc = largest # of datums I send in a single message

  sendmax_proc = 0;
  for (i = 0; i < nsend_proc; i++) {
    MPI_Request tmpReq; // Use non-blocking send to avoid possible deadlock
    MPI_Isend(&num_send[i],1,MPI_INT,proc_send[i],0,world,&tmpReq);
    MPI_Request_free(&tmpReq); // the MPI_Barrier below marks completion
    sendmax_proc = MAX(sendmax_proc,num_send[i]);
  }

  // receive incoming messages
  // proc_recv = procs I recv from
  // num_recv = # of datums each proc sends me
  // nrecvdatum = total # of datums I recv

  int nrecvdatum = 0;
  for (i = 0; i < nrecv_proc; i++) {
@@ -789,6 +964,12 @@ void Irregular::exchange_data(char *sendbuf, int nbytes, char *recvbuf)
  // wait on all incoming messages

  if (nrecv_proc) MPI_Waitall(nrecv_proc,request,status);

  // approximate memory tally

  bigint irregular_bytes = 2*nprocs*sizeof(int);
  irregular_bytes += maxindex*sizeof(int);
  irregular_bytes += maxbuf;
}

/* ----------------------------------------------------------------------
Loading