Actual source code: mpiov.c

  1: /*
  2:    Routines to compute overlapping regions of a parallel MPI matrix
  3:   and to find submatrices that are shared across processors.
  4: */
  5: #include <../src/mat/impls/aij/seq/aij.h>
  6: #include <../src/mat/impls/aij/mpi/mpiaij.h>
  7: #include <petscbt.h>
  8: #include <petscsf.h>

 10: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*);
 11: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**,PetscTable*);
 12: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*);
 13: extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
 14: extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);

 16: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat,PetscInt,IS*);
 17: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat,PetscInt,IS*);
 18: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat,PetscInt,PetscMPIInt,PetscMPIInt *,PetscInt *, PetscInt *,PetscInt **,PetscInt **);
 19: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat,PetscInt,IS*,PetscInt,PetscInt *);

 21: PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
 22: {
 24:   PetscInt       i;

 27:   if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
 28:   for (i=0; i<ov; ++i) {
 29:     MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);
 30:   }
 31:   return(0);
 32: }
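/*
  Usage sketch (illustration only, not part of this file): callers normally reach the
  routine above through the public MatIncreaseOverlap() interface, for example when
  building overlapping subdomains for an additive Schwarz preconditioner.  The index
  sets are both input and output; each overlap level adds one layer of neighboring rows.

     IS is[2];                         // non-overlapping row sets owned by this rank
     // ... create is[0], is[1] with ISCreateGeneral() ...
     MatIncreaseOverlap(C,2,is,1);     // grow each set by one level of overlap
     // ... use the enlarged sets, then ISDestroy() each one ...
*/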

 34: PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C,PetscInt imax,IS is[],PetscInt ov)
 35: {
 37:   PetscInt       i;

 40:   if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
 41:   for (i=0; i<ov; ++i) {
 42:     MatIncreaseOverlap_MPIAIJ_Once_Scalable(C,imax,is);
 43:   }
 44:   return(0);
 45: }

 47: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat,PetscInt nidx,IS is[])
 48: {
 50:   MPI_Comm       comm;
 51:   PetscInt       *length,length_i,tlength,*remoterows,nrrows,reducednrrows,*rrow_ranks,*rrow_isids,i,j;
 52:   PetscInt       *tosizes,*tosizes_temp,*toffsets,*fromsizes,*todata,*fromdata;
 53:   PetscInt       nrecvrows,*sbsizes = NULL,*sbdata = NULL;
 54:   const PetscInt *indices_i,**indices;
 55:   PetscLayout    rmap;
 56:   PetscMPIInt    rank,size,*toranks,*fromranks,nto,nfrom,owner;
 57:   PetscSF        sf;
 58:   PetscSFNode    *remote;

 61:   PetscObjectGetComm((PetscObject)mat,&comm);
 62:   MPI_Comm_rank(comm,&rank);
 63:   MPI_Comm_size(comm,&size);
 64:   /* get row map to determine where rows should be going */
 65:   MatGetLayouts(mat,&rmap,NULL);
 66:   /* retrieve the IS data and put it all together so that we
 67:    * can optimize communication
 68:    */
 69:   PetscMalloc2(nidx,(PetscInt ***)&indices,nidx,&length);
 70:   for (i=0,tlength=0; i<nidx; i++) {
 71:     ISGetLocalSize(is[i],&length[i]);
 72:     tlength += length[i];
 73:     ISGetIndices(is[i],&indices[i]);
 74:   }
 75:   /* find these rows on remote processors */
 76:   PetscCalloc3(tlength,&remoterows,tlength,&rrow_ranks,tlength,&rrow_isids);
 77:   PetscCalloc3(size,&toranks,2*size,&tosizes,size,&tosizes_temp);
 78:   nrrows = 0;
 79:   for (i=0; i<nidx; i++) {
 80:     length_i  = length[i];
 81:     indices_i = indices[i];
 82:     for (j=0; j<length_i; j++) {
 83:       owner = -1;
 84:       PetscLayoutFindOwner(rmap,indices_i[j],&owner);
 85:       /* remote processors */
 86:       if (owner != rank) {
 87:         tosizes_temp[owner]++; /* number of rows to owner */
 88:         rrow_ranks[nrrows]  = owner; /* processor */
 89:         rrow_isids[nrrows]   = i; /* is id */
 90:         remoterows[nrrows++] = indices_i[j]; /* row */
 91:       }
 92:     }
 93:     ISRestoreIndices(is[i],&indices[i]);
 94:   }
 95:   PetscFree2(*(PetscInt***)&indices,length);
 96:   /* test if we need to exchange messages
 97:    * generally speaking, we do not need to exchange
 98:    * data when overlap is 1
 99:    */
100:   MPIU_Allreduce(&nrrows,&reducednrrows,1,MPIU_INT,MPIU_MAX,comm);
101:   /* we do not have any messages;
102:    * this usually corresponds to overlap 1
103:    */
104:   if (!reducednrrows) {
105:     PetscFree3(toranks,tosizes,tosizes_temp);
106:     PetscFree3(remoterows,rrow_ranks,rrow_isids);
107:     MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);
108:     return(0);
109:   }
110:   nto = 0;
111:   /* send sizes and ranks for building a two-sided communication */
112:   for (i=0; i<size; i++) {
113:     if (tosizes_temp[i]) {
114:       tosizes[nto*2]  = tosizes_temp[i]*2; /* size */
115:       tosizes_temp[i] = nto; /* a map from processor to index */
116:       toranks[nto++]  = i; /* processor */
117:     }
118:   }
119:   PetscMalloc1(nto+1,&toffsets);
120:   toffsets[0] = 0;
121:   for (i=0; i<nto; i++) {
122:     toffsets[i+1]  = toffsets[i]+tosizes[2*i]; /* offsets */
123:     tosizes[2*i+1] = toffsets[i]; /* offsets to send */
124:   }
125:   /* send information to other processors */
126:   PetscCommBuildTwoSided(comm,2,MPIU_INT,nto,toranks,tosizes,&nfrom,&fromranks,&fromsizes);
127:   nrecvrows = 0;
128:   for (i=0; i<nfrom; i++) nrecvrows += fromsizes[2*i];
129:   PetscMalloc1(nrecvrows,&remote);
130:   nrecvrows = 0;
131:   for (i=0; i<nfrom; i++) {
132:     for (j=0; j<fromsizes[2*i]; j++) {
133:       remote[nrecvrows].rank    = fromranks[i];
134:       remote[nrecvrows++].index = fromsizes[2*i+1]+j;
135:     }
136:   }
137:   PetscSFCreate(comm,&sf);
138:   PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);
139:   /* use two-sided communication by default since Open MPI has bugs in its one-sided implementation */
140:   PetscSFSetType(sf,PETSCSFBASIC);
141:   PetscSFSetFromOptions(sf);
142:   /* message pair <no of is, row>  */
143:   PetscCalloc2(2*nrrows,&todata,nrecvrows,&fromdata);
144:   for (i=0; i<nrrows; i++) {
145:     owner = rrow_ranks[i]; /* processor */
146:     j     = tosizes_temp[owner]; /* index */
147:     todata[toffsets[j]++] = rrow_isids[i];
148:     todata[toffsets[j]++] = remoterows[i];
149:   }
150:   PetscFree3(toranks,tosizes,tosizes_temp);
151:   PetscFree3(remoterows,rrow_ranks,rrow_isids);
152:   PetscFree(toffsets);
153:   PetscSFBcastBegin(sf,MPIU_INT,todata,fromdata,MPI_REPLACE);
154:   PetscSFBcastEnd(sf,MPIU_INT,todata,fromdata,MPI_REPLACE);
155:   PetscSFDestroy(&sf);
156:   /* send the rows that belong to remote processors so that we can get the overlapping data back */
157:   MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat,nidx,nfrom,fromranks,fromsizes,fromdata,&sbsizes,&sbdata);
158:   PetscFree2(todata,fromdata);
159:   PetscFree(fromsizes);
160:   PetscCommBuildTwoSided(comm,2,MPIU_INT,nfrom,fromranks,sbsizes,&nto,&toranks,&tosizes);
161:   PetscFree(fromranks);
162:   nrecvrows = 0;
163:   for (i=0; i<nto; i++) nrecvrows += tosizes[2*i];
164:   PetscCalloc1(nrecvrows,&todata);
165:   PetscMalloc1(nrecvrows,&remote);
166:   nrecvrows = 0;
167:   for (i=0; i<nto; i++) {
168:     for (j=0; j<tosizes[2*i]; j++) {
169:       remote[nrecvrows].rank    = toranks[i];
170:       remote[nrecvrows++].index = tosizes[2*i+1]+j;
171:     }
172:   }
173:   PetscSFCreate(comm,&sf);
174:   PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);
175:   /* use two-sided communication by default since Open MPI has bugs in its one-sided implementation */
176:   PetscSFSetType(sf,PETSCSFBASIC);
177:   PetscSFSetFromOptions(sf);
178:   /* overlap communication and computation */
179:   PetscSFBcastBegin(sf,MPIU_INT,sbdata,todata,MPI_REPLACE);
180:   MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);
181:   PetscSFBcastEnd(sf,MPIU_INT,sbdata,todata,MPI_REPLACE);
182:   PetscSFDestroy(&sf);
183:   PetscFree2(sbdata,sbsizes);
184:   MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat,nidx,is,nrecvrows,todata);
185:   PetscFree(toranks);
186:   PetscFree(tosizes);
187:   PetscFree(todata);
188:   return(0);
189: }
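/*
  Minimal sketch (illustration only; names below are generic placeholders) of the
  PetscSF pattern used twice in the routine above: PetscCommBuildTwoSided() discovers
  who sends to whom and how much, the receiving side describes the remote entries it
  needs as (rank,index) pairs, and a broadcast moves root data into the leaves.

     PetscSF     sf;
     PetscSFNode *remote;              // one {rank,index} pair per leaf entry
     PetscInt    *rootdata,*leafdata;
     PetscSFCreate(comm,&sf);
     PetscSFSetGraph(sf,nroots,nleaves,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);
     PetscSFBcastBegin(sf,MPIU_INT,rootdata,leafdata,MPI_REPLACE);
     PetscSFBcastEnd(sf,MPIU_INT,rootdata,leafdata,MPI_REPLACE);
     PetscSFDestroy(&sf);
*/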

191: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat,PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata)
192: {
193:   PetscInt         *isz,isz_i,i,j,is_id, data_size;
194:   PetscInt          col,lsize,max_lsize,*indices_temp, *indices_i;
195:   const PetscInt   *indices_i_temp;
196:   MPI_Comm         *iscomms;
197:   PetscErrorCode    ierr;

200:   max_lsize = 0;
201:   PetscMalloc1(nidx,&isz);
202:   for (i=0; i<nidx; i++) {
203:     ISGetLocalSize(is[i],&lsize);
204:     max_lsize = lsize>max_lsize ? lsize:max_lsize;
205:     isz[i]    = lsize;
206:   }
207:   PetscMalloc2((max_lsize+nrecvs)*nidx,&indices_temp,nidx,&iscomms);
208:   for (i=0; i<nidx; i++) {
209:     PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomms[i],NULL);
210:     ISGetIndices(is[i],&indices_i_temp);
211:     PetscArraycpy(indices_temp+i*(max_lsize+nrecvs),indices_i_temp, isz[i]);
212:     ISRestoreIndices(is[i],&indices_i_temp);
213:     ISDestroy(&is[i]);
214:   }
215:   /* retrieve information to get row id and its overlap */
216:   for (i=0; i<nrecvs;) {
217:     is_id     = recvdata[i++];
218:     data_size = recvdata[i++];
219:     indices_i = indices_temp+(max_lsize+nrecvs)*is_id;
220:     isz_i     = isz[is_id];
221:     for (j=0; j< data_size; j++) {
222:       col = recvdata[i++];
223:       indices_i[isz_i++] = col;
224:     }
225:     isz[is_id] = isz_i;
226:   }
227:   /* remove duplicate entries */
228:   for (i=0; i<nidx; i++) {
229:     indices_i = indices_temp+(max_lsize+nrecvs)*i;
230:     isz_i     = isz[i];
231:     PetscSortRemoveDupsInt(&isz_i,indices_i);
232:     ISCreateGeneral(iscomms[i],isz_i,indices_i,PETSC_COPY_VALUES,&is[i]);
233:     PetscCommDestroy(&iscomms[i]);
234:   }
235:   PetscFree(isz);
236:   PetscFree2(indices_temp,iscomms);
237:   return(0);
238: }

240: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat,PetscInt nidx, PetscMPIInt nfrom,PetscMPIInt *fromranks,PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows)
241: {
242:   PetscLayout       rmap,cmap;
243:   PetscInt          i,j,k,l,*rows_i,*rows_data_ptr,**rows_data,max_fszs,rows_pos,*rows_pos_i;
244:   PetscInt          is_id,tnz,an,bn,rstart,cstart,row,start,end,col,totalrows,*sbdata;
245:   PetscInt         *indv_counts,indvc_ij,*sbsizes,*indices_tmp,*offsets;
246:   const PetscInt   *gcols,*ai,*aj,*bi,*bj;
247:   Mat               amat,bmat;
248:   PetscMPIInt       rank;
249:   PetscBool         done;
250:   MPI_Comm          comm;
251:   PetscErrorCode    ierr;

254:   PetscObjectGetComm((PetscObject)mat,&comm);
255:   MPI_Comm_rank(comm,&rank);
256:   MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);
257:   /* Even if the matrix is symmetric, we treat it as nonsymmetric here */
258:   MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);
259:   if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"cannot get row IJ");
260:   MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);
261:   if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"cannot get row IJ");
262:   /* total number of nonzero values is used to estimate the memory usage in the next step */
263:   tnz  = ai[an]+bi[bn];
264:   MatGetLayouts(mat,&rmap,&cmap);
265:   PetscLayoutGetRange(rmap,&rstart,NULL);
266:   PetscLayoutGetRange(cmap,&cstart,NULL);
267:   /* to find the longest message */
268:   max_fszs = 0;
269:   for (i=0; i<nfrom; i++) max_fszs = fromsizes[2*i]>max_fszs ? fromsizes[2*i]:max_fszs;
270:   /* is there a better way to estimate the number of nonzeros in the matrix? */
271:   PetscCalloc5(max_fszs*nidx,&rows_data_ptr,nidx,&rows_data,nidx,&rows_pos_i,nfrom*nidx,&indv_counts,tnz,&indices_tmp);
272:   for (i=0; i<nidx; i++) rows_data[i] = rows_data_ptr+max_fszs*i;
273:   rows_pos  = 0;
274:   totalrows = 0;
275:   for (i=0; i<nfrom; i++) {
276:     PetscArrayzero(rows_pos_i,nidx);
277:     /* group data together */
278:     for (j=0; j<fromsizes[2*i]; j+=2) {
279:       is_id                       = fromrows[rows_pos++];/* no of is */
280:       rows_i                      = rows_data[is_id];
281:       rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];/* row */
282:     }
283:     /* estimate the space needed to avoid multiple allocations */
284:     for (j=0; j<nidx; j++) {
285:       indvc_ij = 0;
286:       rows_i   = rows_data[j];
287:       for (l=0; l<rows_pos_i[j]; l++) {
288:         row    = rows_i[l]-rstart;
289:         start  = ai[row];
290:         end    = ai[row+1];
291:         for (k=start; k<end; k++) { /* Amat */
292:           col = aj[k] + cstart;
293:           indices_tmp[indvc_ij++] = col;/* do not count the rows from the original rank */
294:         }
295:         start = bi[row];
296:         end   = bi[row+1];
297:         for (k=start; k<end; k++) { /* Bmat */
298:           col = gcols[bj[k]];
299:           indices_tmp[indvc_ij++] = col;
300:         }
301:       }
302:       PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);
303:       indv_counts[i*nidx+j] = indvc_ij;
304:       totalrows            += indvc_ij;
305:     }
306:   }
307:   /* message triple <no of is, number of rows, rows> */
308:   PetscCalloc2(totalrows+nidx*nfrom*2,&sbdata,2*nfrom,&sbsizes);
309:   totalrows = 0;
310:   rows_pos  = 0;
311:   /* repeat the same grouping as above, this time packing the send buffer */
312:   for (i=0;i<nfrom;i++) {
313:     PetscArrayzero(rows_pos_i,nidx);
314:     for (j=0; j<fromsizes[2*i]; j+=2) {
315:       is_id                       = fromrows[rows_pos++];
316:       rows_i                      = rows_data[is_id];
317:       rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
318:     }
319:     /* add data  */
320:     for (j=0; j<nidx; j++) {
321:       if (!indv_counts[i*nidx+j]) continue;
322:       indvc_ij = 0;
323:       sbdata[totalrows++] = j;
324:       sbdata[totalrows++] = indv_counts[i*nidx+j];
325:       sbsizes[2*i]       += 2;
326:       rows_i              = rows_data[j];
327:       for (l=0; l<rows_pos_i[j]; l++) {
328:         row   = rows_i[l]-rstart;
329:         start = ai[row];
330:         end   = ai[row+1];
331:         for (k=start; k<end; k++) { /* Amat */
332:           col = aj[k] + cstart;
333:           indices_tmp[indvc_ij++] = col;
334:         }
335:         start = bi[row];
336:         end   = bi[row+1];
337:         for (k=start; k<end; k++) { /* Bmat */
338:           col = gcols[bj[k]];
339:           indices_tmp[indvc_ij++] = col;
340:         }
341:       }
342:       PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);
343:       sbsizes[2*i]  += indvc_ij;
344:       PetscArraycpy(sbdata+totalrows,indices_tmp,indvc_ij);
345:       totalrows += indvc_ij;
346:     }
347:   }
348:   PetscMalloc1(nfrom+1,&offsets);
349:   offsets[0] = 0;
350:   for (i=0; i<nfrom; i++) {
351:     offsets[i+1]   = offsets[i] + sbsizes[2*i];
352:     sbsizes[2*i+1] = offsets[i];
353:   }
354:   PetscFree(offsets);
355:   if (sbrowsizes) *sbrowsizes = sbsizes;
356:   if (sbrows) *sbrows = sbdata;
357:   PetscFree5(rows_data_ptr,rows_data,rows_pos_i,indv_counts,indices_tmp);
358:   MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);
359:   MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);
360:   return(0);
361: }

363: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat,PetscInt nidx, IS is[])
364: {
365:   const PetscInt   *gcols,*ai,*aj,*bi,*bj, *indices;
366:   PetscInt          tnz,an,bn,i,j,row,start,end,rstart,cstart,col,k,*indices_temp;
367:   PetscInt          lsize,lsize_tmp;
368:   PetscMPIInt       rank,owner;
369:   Mat               amat,bmat;
370:   PetscBool         done;
371:   PetscLayout       cmap,rmap;
372:   MPI_Comm          comm;
373:   PetscErrorCode    ierr;

376:   PetscObjectGetComm((PetscObject)mat,&comm);
377:   MPI_Comm_rank(comm,&rank);
378:   MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);
379:   MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);
380:   if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"cannot get row IJ");
381:   MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);
382:   if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"cannot get row IJ");
383:   /* is this a safe way to compute the number of nonzero values? */
384:   tnz  = ai[an]+bi[bn];
385:   MatGetLayouts(mat,&rmap,&cmap);
386:   PetscLayoutGetRange(rmap,&rstart,NULL);
387:   PetscLayoutGetRange(cmap,&cstart,NULL);
388:   /* this is a better way to estimate memory than the old implementation,
389:    * which used the global size of the matrix
390:    */
391:   PetscMalloc1(tnz,&indices_temp);
392:   for (i=0; i<nidx; i++) {
393:     MPI_Comm iscomm;

395:     ISGetLocalSize(is[i],&lsize);
396:     ISGetIndices(is[i],&indices);
397:     lsize_tmp = 0;
398:     for (j=0; j<lsize; j++) {
399:       owner = -1;
400:       row   = indices[j];
401:       PetscLayoutFindOwner(rmap,row,&owner);
402:       if (owner != rank) continue;
403:       /* local number */
404:       row  -= rstart;
405:       start = ai[row];
406:       end   = ai[row+1];
407:       for (k=start; k<end; k++) { /* Amat */
408:         col = aj[k] + cstart;
409:         indices_temp[lsize_tmp++] = col;
410:       }
411:       start = bi[row];
412:       end   = bi[row+1];
413:       for (k=start; k<end; k++) { /* Bmat */
414:         col = gcols[bj[k]];
415:         indices_temp[lsize_tmp++] = col;
416:       }
417:     }
418:     ISRestoreIndices(is[i],&indices);
419:     PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomm,NULL);
420:     ISDestroy(&is[i]);
421:     PetscSortRemoveDupsInt(&lsize_tmp,indices_temp);
422:     ISCreateGeneral(iscomm,lsize_tmp,indices_temp,PETSC_COPY_VALUES,&is[i]);
423:     PetscCommDestroy(&iscomm);
424:   }
425:   PetscFree(indices_temp);
426:   MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);
427:   MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);
428:   return(0);
429: }

431: /*
432:   Sample message format:
433:   If a processor A wants processor B to process some elements corresponding
434:   to index sets is[1],is[5]
435:   mesg [0] = 2   (no of index sets in the mesg)
436:   -----------
437:   mesg [1] = 1 => is[1]
438:   mesg [2] = size of is[1]
439:   -----------
440:   mesg [3] = 5 => is[5]
441:   mesg [4] = size of is[5]
442:   -----------
443:   mesg [5]
444:   mesg [n]       data for is[1]
445:   -----------
446:   mesg [n+1]
447:   mesg [m]       data for is[5]
448:   -----------

450:   Notes:
451:   nrqs - no of requests sent (or to be sent out)
452:   nrqr - no of requests received (which have to be or which have been processed)
453: */
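/*
  Worked example of the format above (illustrative, not part of the original source):
  if this process asks processor B to handle 3 rows of is[1] and 2 rows of is[5], the
  outgoing buffer is laid out as

     mesg[0] = 2                header: two index sets in this message
     mesg[1] = 1, mesg[2] = 3   is[1] contributes 3 rows
     mesg[3] = 5, mesg[4] = 2   is[5] contributes 2 rows
     mesg[5..7]                 the 3 requested (global) row indices for is[1]
     mesg[8..9]                 the 2 requested (global) row indices for is[5]
*/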
454: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
455: {
456:   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
457:   PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
458:   const PetscInt **idx,*idx_i;
459:   PetscInt       *n,**data,len;
460: #if defined(PETSC_USE_CTABLE)
461:   PetscTable     *table_data,table_data_i;
462:   PetscInt       *tdata,tcount,tcount_max;
463: #else
464:   PetscInt       *data_i,*d_p;
465: #endif
467:   PetscMPIInt    size,rank,tag1,tag2,proc = 0;
468:   PetscInt       M,i,j,k,**rbuf,row,nrqs,msz,**outdat,**ptr;
469:   PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2;
470:   PetscBT        *table;
471:   MPI_Comm       comm;
472:   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
473:   MPI_Status     *recv_status;
474:   MPI_Comm       *iscomms;
475:   char           *t_p;

478:   PetscObjectGetComm((PetscObject)C,&comm);
479:   size = c->size;
480:   rank = c->rank;
481:   M    = C->rmap->N;

483:   PetscObjectGetNewTag((PetscObject)C,&tag1);
484:   PetscObjectGetNewTag((PetscObject)C,&tag2);

486:   PetscMalloc2(imax,(PetscInt***)&idx,imax,&n);

488:   for (i=0; i<imax; i++) {
489:     ISGetIndices(is[i],&idx[i]);
490:     ISGetLocalSize(is[i],&n[i]);
491:   }

493:   /* evaluate communication - message to whom, length of message, and buffer space
494:      required. Based on this, buffers are allocated and data copied into them  */
495:   PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4);
496:   for (i=0; i<imax; i++) {
497:     PetscArrayzero(w4,size); /* initialise work vector*/
498:     idx_i = idx[i];
499:     len   = n[i];
500:     for (j=0; j<len; j++) {
501:       row = idx_i[j];
502:       if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
503:       PetscLayoutFindOwner(C->rmap,row,&proc);
504:       w4[proc]++;
505:     }
506:     for (j=0; j<size; j++) {
507:       if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
508:     }
509:   }

511:   nrqs     = 0;              /* no of outgoing messages */
512:   msz      = 0;              /* total mesg length (for all procs) */
513:   w1[rank] = 0;              /* no mesg sent to itself */
514:   w3[rank] = 0;
515:   for (i=0; i<size; i++) {
516:     if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
517:   }
518:   /* pa - list of processors to communicate with */
519:   PetscMalloc1(nrqs,&pa);
520:   for (i=0,j=0; i<size; i++) {
521:     if (w1[i]) {pa[j] = i; j++;}
522:   }

524:   /* Each message would have a header = 1 + 2*(no of IS) + data */
525:   for (i=0; i<nrqs; i++) {
526:     j      = pa[i];
527:     w1[j] += w2[j] + 2*w3[j];
528:     msz   += w1[j];
529:   }

531:   /* Determine the number of messages to expect, their lengths, and their from-ids */
532:   PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);
533:   PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);

535:   /* Now post the Irecvs corresponding to these messages */
536:   PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);

538:   /* Allocate Memory for outgoing messages */
539:   PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);
540:   PetscArrayzero(outdat,size);
541:   PetscArrayzero(ptr,size);

543:   {
544:     PetscInt *iptr = tmp,ict  = 0;
545:     for (i=0; i<nrqs; i++) {
546:       j         = pa[i];
547:       iptr     +=  ict;
548:       outdat[j] = iptr;
549:       ict       = w1[j];
550:     }
551:   }

553:   /* Form the outgoing messages */
554:   /* plug in the headers */
555:   for (i=0; i<nrqs; i++) {
556:     j            = pa[i];
557:     outdat[j][0] = 0;
558:     PetscArrayzero(outdat[j]+1,2*w3[j]);
559:     ptr[j]       = outdat[j] + 2*w3[j] + 1;
560:   }

562:   /* Memory for doing local proc's work */
563:   {
564:     PetscInt M_BPB_imax = 0;
565: #if defined(PETSC_USE_CTABLE)
566:     PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);
567:     PetscMalloc1(imax,&table_data);
568:     for (i=0; i<imax; i++) {
569:       PetscTableCreate(n[i],M,&table_data[i]);
570:     }
571:     PetscCalloc4(imax,&table, imax,&data, imax,&isz, M_BPB_imax,&t_p);
572:     for (i=0; i<imax; i++) {
573:       table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
574:     }
575: #else
576:     PetscInt Mimax = 0;
577:     PetscIntMultError(M,imax, &Mimax);
578:     PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);
579:     PetscCalloc5(imax,&table, imax,&data, imax,&isz, Mimax,&d_p, M_BPB_imax,&t_p);
580:     for (i=0; i<imax; i++) {
581:       table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
582:       data[i]  = d_p + M*i;
583:     }
584: #endif
585:   }

587:   /* Parse the IS and update local tables and the outgoing buf with the data */
588:   {
589:     PetscInt n_i,isz_i,*outdat_j,ctr_j;
590:     PetscBT  table_i;

592:     for (i=0; i<imax; i++) {
593:       PetscArrayzero(ctr,size);
594:       n_i     = n[i];
595:       table_i = table[i];
596:       idx_i   = idx[i];
597: #if defined(PETSC_USE_CTABLE)
598:       table_data_i = table_data[i];
599: #else
600:       data_i  = data[i];
601: #endif
602:       isz_i   = isz[i];
603:       for (j=0; j<n_i; j++) {   /* parse the indices of each IS */
604:         row  = idx_i[j];
605:         PetscLayoutFindOwner(C->rmap,row,&proc);
606:         if (proc != rank) { /* copy to the outgoing buffer */
607:           ctr[proc]++;
608:           *ptr[proc] = row;
609:           ptr[proc]++;
610:         } else if (!PetscBTLookupSet(table_i,row)) {
611: #if defined(PETSC_USE_CTABLE)
612:           PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);
613: #else
614:           data_i[isz_i] = row; /* Update the local table */
615: #endif
616:           isz_i++;
617:         }
618:       }
619:       /* Update the headers for the current IS */
620:       for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
621:         if ((ctr_j = ctr[j])) {
622:           outdat_j        = outdat[j];
623:           k               = ++outdat_j[0];
624:           outdat_j[2*k]   = ctr_j;
625:           outdat_j[2*k-1] = i;
626:         }
627:       }
628:       isz[i] = isz_i;
629:     }
630:   }

632:   /*  Now  post the sends */
633:   PetscMalloc1(nrqs,&s_waits1);
634:   for (i=0; i<nrqs; ++i) {
635:     j    = pa[i];
636:     MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);
637:   }

639:   /* No longer need the original indices */
640:   PetscMalloc1(imax,&iscomms);
641:   for (i=0; i<imax; ++i) {
642:     ISRestoreIndices(is[i],idx+i);
643:     PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomms[i],NULL);
644:   }
645:   PetscFree2(*(PetscInt***)&idx,n);

647:   for (i=0; i<imax; ++i) {
648:     ISDestroy(&is[i]);
649:   }

651:   /* Do Local work */
652: #if defined(PETSC_USE_CTABLE)
653:   MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,NULL,table_data);
654: #else
655:   MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data,NULL);
656: #endif

658:   /* Receive messages */
659:   PetscMalloc1(nrqr,&recv_status);
660:   MPI_Waitall(nrqr,r_waits1,recv_status);
661:   MPI_Waitall(nrqs,s_waits1,MPI_STATUSES_IGNORE);

663:   /* Phase 1 sends are complete - deallocate buffers */
664:   PetscFree4(outdat,ptr,tmp,ctr);
665:   PetscFree4(w1,w2,w3,w4);

667:   PetscMalloc1(nrqr,&xdata);
668:   PetscMalloc1(nrqr,&isz1);
669:   MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);
670:   PetscFree(rbuf[0]);
671:   PetscFree(rbuf);

673:   /* Send the data back */
674:   /* Determine the buffer space required for the incoming messages */
675:   {
676:     PetscMPIInt *rw1;

678:     PetscCalloc1(size,&rw1);

680:     for (i=0; i<nrqr; ++i) {
681:       proc = recv_status[i].MPI_SOURCE;

683:       if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
684:       rw1[proc] = isz1[i];
685:     }
686:     PetscFree(onodes1);
687:     PetscFree(olengths1);

689:     /* Determine the number of messages to expect, their lengths, and their from-ids */
690:     PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);
691:     PetscFree(rw1);
692:   }
693:   /* Now post the Irecvs corresponding to these messages */
694:   PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);

696:   /* Now  post the sends */
697:   PetscMalloc1(nrqr,&s_waits2);
698:   for (i=0; i<nrqr; ++i) {
699:     j    = recv_status[i].MPI_SOURCE;
700:     MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);
701:   }

703:   /* receive work done on other processors */
704:   {
705:     PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,jmax;
706:     PetscMPIInt idex;
707:     PetscBT     table_i;

709:     for (i=0; i<nrqs; ++i) {
710:       MPI_Waitany(nrqs,r_waits2,&idex,MPI_STATUS_IGNORE);
711:       /* Process the message */
712:       rbuf2_i = rbuf2[idex];
713:       ct1     = 2*rbuf2_i[0]+1;
714:       jmax    = rbuf2[idex][0];
715:       for (j=1; j<=jmax; j++) {
716:         max     = rbuf2_i[2*j];
717:         is_no   = rbuf2_i[2*j-1];
718:         isz_i   = isz[is_no];
719:         table_i = table[is_no];
720: #if defined(PETSC_USE_CTABLE)
721:         table_data_i = table_data[is_no];
722: #else
723:         data_i  = data[is_no];
724: #endif
725:         for (k=0; k<max; k++,ct1++) {
726:           row = rbuf2_i[ct1];
727:           if (!PetscBTLookupSet(table_i,row)) {
728: #if defined(PETSC_USE_CTABLE)
729:             PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);
730: #else
731:             data_i[isz_i] = row;
732: #endif
733:             isz_i++;
734:           }
735:         }
736:         isz[is_no] = isz_i;
737:       }
738:     }

740:     MPI_Waitall(nrqr,s_waits2,MPI_STATUSES_IGNORE);
741:   }

743: #if defined(PETSC_USE_CTABLE)
744:   tcount_max = 0;
745:   for (i=0; i<imax; ++i) {
746:     table_data_i = table_data[i];
747:     PetscTableGetCount(table_data_i,&tcount);
748:     if (tcount_max < tcount) tcount_max = tcount;
749:   }
750:   PetscMalloc1(tcount_max+1,&tdata);
751: #endif

753:   for (i=0; i<imax; ++i) {
754: #if defined(PETSC_USE_CTABLE)
755:     PetscTablePosition tpos;
756:     table_data_i = table_data[i];

758:     PetscTableGetHeadPosition(table_data_i,&tpos);
759:     while (tpos) {
760:       PetscTableGetNext(table_data_i,&tpos,&k,&j);
761:       tdata[--j] = --k;
762:     }
763:     ISCreateGeneral(iscomms[i],isz[i],tdata,PETSC_COPY_VALUES,is+i);
764: #else
765:     ISCreateGeneral(iscomms[i],isz[i],data[i],PETSC_COPY_VALUES,is+i);
766: #endif
767:     PetscCommDestroy(&iscomms[i]);
768:   }

770:   PetscFree(iscomms);
771:   PetscFree(onodes2);
772:   PetscFree(olengths2);

774:   PetscFree(pa);
775:   PetscFree(rbuf2[0]);
776:   PetscFree(rbuf2);
777:   PetscFree(s_waits1);
778:   PetscFree(r_waits1);
779:   PetscFree(s_waits2);
780:   PetscFree(r_waits2);
781:   PetscFree(recv_status);
782:   if (xdata) {
783:     PetscFree(xdata[0]);
784:     PetscFree(xdata);
785:   }
786:   PetscFree(isz1);
787: #if defined(PETSC_USE_CTABLE)
788:   for (i=0; i<imax; i++) {
789:     PetscTableDestroy((PetscTable*)&table_data[i]);
790:   }
791:   PetscFree(table_data);
792:   PetscFree(tdata);
793:   PetscFree4(table,data,isz,t_p);
794: #else
795:   PetscFree5(table,data,isz,d_p,t_p);
796: #endif
797:   return(0);
798: }
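/*
  Recap of the handshake used above (illustrative sketch; variable names are
  placeholders).  Both communication phases follow the standard PETSc pattern for
  irregular all-to-all exchange of integer messages:

     PetscGatherNumberOfMessages(comm,w2,w1,&nrecvs);                     // how many messages arrive
     PetscGatherMessageLengths(comm,nsends,nrecvs,w1,&onodes,&olengths);  // from whom, and how long
     PetscPostIrecvInt(comm,tag,nrecvs,onodes,olengths,&rbuf,&r_waits);   // post the receives
     for (i=0; i<nsends; i++) MPI_Isend(outbuf[pa[i]],w1[pa[i]],MPIU_INT,pa[i],tag,comm,s_waits+i);
     MPI_Waitall(nrecvs,r_waits,statuses);                                // then process rbuf[]
*/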

800: /*
801:    MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
802:        the work on the local processor.

804:      Inputs:
805:       C      - MAT_MPIAIJ;
806:       imax - total no of index sets processed at a time;
807:       table  - an array of char - size = m bits.

809:      Output:
810:       isz    - array containing the count of the solution elements corresponding
811:                to each index set;
812:       data or table_data  - pointer to the solutions
813: */
814: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data,PetscTable *table_data)
815: {
816:   Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
817:   Mat        A  = c->A,B = c->B;
818:   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
819:   PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
820:   PetscInt   *bi,*bj,*garray,i,j,k,row,isz_i;
821:   PetscBT    table_i;
822: #if defined(PETSC_USE_CTABLE)
823:   PetscTable         table_data_i;
824:   PetscErrorCode     ierr;
825:   PetscTablePosition tpos;
826:   PetscInt           tcount,*tdata;
827: #else
828:   PetscInt           *data_i;
829: #endif

832:   rstart = C->rmap->rstart;
833:   cstart = C->cmap->rstart;
834:   ai     = a->i;
835:   aj     = a->j;
836:   bi     = b->i;
837:   bj     = b->j;
838:   garray = c->garray;

840:   for (i=0; i<imax; i++) {
841: #if defined(PETSC_USE_CTABLE)
842:     /* copy existing entries of table_data_i into tdata[] */
843:     table_data_i = table_data[i];
844:     PetscTableGetCount(table_data_i,&tcount);
845:     if (tcount != isz[i]) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_PLIB," tcount %d != isz[%d] %d",tcount,i,isz[i]);

847:     PetscMalloc1(tcount,&tdata);
848:     PetscTableGetHeadPosition(table_data_i,&tpos);
849:     while (tpos) {
850:       PetscTableGetNext(table_data_i,&tpos,&row,&j);
851:       tdata[--j] = --row;
852:       if (j > tcount - 1) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB," j %d >= tcount %d",j,tcount);
853:     }
854: #else
855:     data_i  = data[i];
856: #endif
857:     table_i = table[i];
858:     isz_i   = isz[i];
859:     max     = isz[i];

861:     for (j=0; j<max; j++) {
862: #if defined(PETSC_USE_CTABLE)
863:       row   = tdata[j] - rstart;
864: #else
865:       row   = data_i[j] - rstart;
866: #endif
867:       start = ai[row];
868:       end   = ai[row+1];
869:       for (k=start; k<end; k++) { /* Amat */
870:         val = aj[k] + cstart;
871:         if (!PetscBTLookupSet(table_i,val)) {
872: #if defined(PETSC_USE_CTABLE)
873:           PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);
874: #else
875:           data_i[isz_i] = val;
876: #endif
877:           isz_i++;
878:         }
879:       }
880:       start = bi[row];
881:       end   = bi[row+1];
882:       for (k=start; k<end; k++) { /* Bmat */
883:         val = garray[bj[k]];
884:         if (!PetscBTLookupSet(table_i,val)) {
885: #if defined(PETSC_USE_CTABLE)
886:           PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);
887: #else
888:           data_i[isz_i] = val;
889: #endif
890:           isz_i++;
891:         }
892:       }
893:     }
894:     isz[i] = isz_i;

896: #if defined(PETSC_USE_CTABLE)
897:     PetscFree(tdata);
898: #endif
899:   }
900:   return(0);
901: }
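/*
  Note on the column numbering used above (illustrative): the diagonal block A of a
  MATMPIAIJ matrix stores local column indices, so the global column is aj[k] + cstart,
  while the off-diagonal block B stores compressed indices that map through garray,
  so the global column is garray[bj[k]].  Traversing one local row therefore looks like

     for (k=ai[row]; k<ai[row+1]; k++) gcol = aj[k] + cstart;   // diagonal block
     for (k=bi[row]; k<bi[row+1]; k++) gcol = garray[bj[k]];    // off-diagonal block
*/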

903: /*
904:       MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
905:          and return the output

907:          Input:
908:            C    - the matrix
909:            nrqr - no of messages being processed.
910:            rbuf - an array of pointers to the received requests

912:          Output:
913:            xdata - array of messages to be sent back
914:            isz1  - size of each message

916:   For better efficiency perhaps we should malloc each xdata[i] separately;
917: then, if a remalloc were required, we would only need to copy the data for that
918: one row rather than all previous rows, as happens now where a single large
919: chunk of memory is used.

921: */
922: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
923: {
924:   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
925:   Mat            A  = c->A,B = c->B;
926:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
928:   PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
929:   PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
930:   PetscInt       val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
931:   PetscInt       *rbuf_i,kmax,rbuf_0;
932:   PetscBT        xtable;

935:   m      = C->rmap->N;
936:   rstart = C->rmap->rstart;
937:   cstart = C->cmap->rstart;
938:   ai     = a->i;
939:   aj     = a->j;
940:   bi     = b->i;
941:   bj     = b->j;
942:   garray = c->garray;

944:   for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
945:     rbuf_i =  rbuf[i];
946:     rbuf_0 =  rbuf_i[0];
947:     ct    += rbuf_0;
948:     for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
949:   }

951:   if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
952:   else max1 = 1;
953:   mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
954:   if (nrqr) {
955:     PetscMalloc1(mem_estimate,&xdata[0]);
956:     ++no_malloc;
957:   }
958:   PetscBTCreate(m,&xtable);
959:   PetscArrayzero(isz1,nrqr);

961:   ct3 = 0;
962:   for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
963:     rbuf_i =  rbuf[i];
964:     rbuf_0 =  rbuf_i[0];
965:     ct1    =  2*rbuf_0+1;
966:     ct2    =  ct1;
967:     ct3   += ct1;
968:     for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
969:       PetscBTMemzero(m,xtable);
970:       oct2 = ct2;
971:       kmax = rbuf_i[2*j];
972:       for (k=0; k<kmax; k++,ct1++) {
973:         row = rbuf_i[ct1];
974:         if (!PetscBTLookupSet(xtable,row)) {
975:           if (!(ct3 < mem_estimate)) {
976:             new_estimate = (PetscInt)(1.5*mem_estimate)+1;
977:             PetscMalloc1(new_estimate,&tmp);
978:             PetscArraycpy(tmp,xdata[0],mem_estimate);
979:             PetscFree(xdata[0]);
980:             xdata[0]     = tmp;
981:             mem_estimate = new_estimate; ++no_malloc;
982:             for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
983:           }
984:           xdata[i][ct2++] = row;
985:           ct3++;
986:         }
987:       }
988:       for (k=oct2,max2=ct2; k<max2; k++) {
989:         row   = xdata[i][k] - rstart;
990:         start = ai[row];
991:         end   = ai[row+1];
992:         for (l=start; l<end; l++) {
993:           val = aj[l] + cstart;
994:           if (!PetscBTLookupSet(xtable,val)) {
995:             if (!(ct3 < mem_estimate)) {
996:               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
997:               PetscMalloc1(new_estimate,&tmp);
998:               PetscArraycpy(tmp,xdata[0],mem_estimate);
999:               PetscFree(xdata[0]);
1000:               xdata[0]     = tmp;
1001:               mem_estimate = new_estimate; ++no_malloc;
1002:               for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
1003:             }
1004:             xdata[i][ct2++] = val;
1005:             ct3++;
1006:           }
1007:         }
1008:         start = bi[row];
1009:         end   = bi[row+1];
1010:         for (l=start; l<end; l++) {
1011:           val = garray[bj[l]];
1012:           if (!PetscBTLookupSet(xtable,val)) {
1013:             if (!(ct3 < mem_estimate)) {
1014:               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
1015:               PetscMalloc1(new_estimate,&tmp);
1016:               PetscArraycpy(tmp,xdata[0],mem_estimate);
1017:               PetscFree(xdata[0]);
1018:               xdata[0]     = tmp;
1019:               mem_estimate = new_estimate; ++no_malloc;
1020:               for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
1021:             }
1022:             xdata[i][ct2++] = val;
1023:             ct3++;
1024:           }
1025:         }
1026:       }
1027:       /* Update the header*/
1028:       xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
1029:       xdata[i][2*j-1] = rbuf_i[2*j-1];
1030:     }
1031:     xdata[i][0] = rbuf_0;
1032:     if (i+1<nrqr) xdata[i+1]  = xdata[i] + ct2;
1033:     isz1[i]     = ct2; /* size of each message */
1034:   }
1035:   PetscBTDestroy(&xtable);
1036:   PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);
1037:   return(0);
1038: }
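/*
  Outline of the buffer-growth pattern repeated above (all replies share one buffer
  anchored at xdata[0]; when the estimate is exceeded the buffer grows by roughly 1.5x
  and the per-message pointers are re-derived from the sizes accumulated so far):

     new_estimate = (PetscInt)(1.5*mem_estimate)+1;
     PetscMalloc1(new_estimate,&tmp);
     PetscArraycpy(tmp,xdata[0],mem_estimate);
     PetscFree(xdata[0]);
     xdata[0]     = tmp;
     mem_estimate = new_estimate; ++no_malloc;
     for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
*/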
1039: /* -------------------------------------------------------------------------*/
1040: extern PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat*);
1041: /*
1042:     Every processor gets the entire matrix
1043: */
1044: PetscErrorCode MatCreateSubMatrix_MPIAIJ_All(Mat A,MatCreateSubMatrixOption flag,MatReuse scall,Mat *Bin[])
1045: {
1046:   Mat            B;
1047:   Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
1048:   Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
1050:   PetscMPIInt    size,rank,*recvcounts = NULL,*displs = NULL;
1051:   PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
1052:   PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;

1055:   MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);
1056:   MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank);
1057:   if (scall == MAT_INITIAL_MATRIX) {
1058:     /* ----------------------------------------------------------------
1059:          Tell every processor the number of nonzeros per row
1060:     */
1061:     PetscMalloc1(A->rmap->N,&lens);
1062:     for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
1063:       lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
1064:     }
1065:     PetscMalloc2(size,&recvcounts,size,&displs);
1066:     for (i=0; i<size; i++) {
1067:       recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
1068:       displs[i]     = A->rmap->range[i];
1069:     }
1070: #if defined(PETSC_HAVE_MPI_IN_PLACE)
1071:     MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));
1072: #else
1073:     sendcount = A->rmap->rend - A->rmap->rstart;
1074:     MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));
1075: #endif
1076:     /* ---------------------------------------------------------------
1077:          Create the sequential matrix of the same type as the local block diagonal
1078:     */
1079:     MatCreate(PETSC_COMM_SELF,&B);
1080:     MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);
1081:     MatSetBlockSizesFromMats(B,A,A);
1082:     MatSetType(B,((PetscObject)a->A)->type_name);
1083:     MatSeqAIJSetPreallocation(B,0,lens);
1084:     PetscCalloc1(2,Bin);
1085:     **Bin = B;
1086:     b     = (Mat_SeqAIJ*)B->data;

1088:     /*--------------------------------------------------------------------
1089:        Copy my part of matrix column indices over
1090:     */
1091:     sendcount  = ad->nz + bd->nz;
1092:     jsendbuf   = b->j + b->i[rstarts[rank]];
1093:     a_jsendbuf = ad->j;
1094:     b_jsendbuf = bd->j;
1095:     n          = A->rmap->rend - A->rmap->rstart;
1096:     cnt        = 0;
1097:     for (i=0; i<n; i++) {
1098:       /* put in lower diagonal portion */
1099:       m = bd->i[i+1] - bd->i[i];
1100:       while (m > 0) {
1101:         /* is it above diagonal (in bd (compressed) numbering) */
1102:         if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
1103:         jsendbuf[cnt++] = garray[*b_jsendbuf++];
1104:         m--;
1105:       }

1107:       /* put in diagonal portion */
1108:       for (j=ad->i[i]; j<ad->i[i+1]; j++) {
1109:         jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
1110:       }

1112:       /* put in upper diagonal portion */
1113:       while (m-- > 0) {
1114:         jsendbuf[cnt++] = garray[*b_jsendbuf++];
1115:       }
1116:     }
1117:     if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

1119:     /*--------------------------------------------------------------------
1120:        Gather all column indices to all processors
1121:     */
1122:     for (i=0; i<size; i++) {
1123:       recvcounts[i] = 0;
1124:       for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
1125:         recvcounts[i] += lens[j];
1126:       }
1127:     }
1128:     displs[0] = 0;
1129:     for (i=1; i<size; i++) {
1130:       displs[i] = displs[i-1] + recvcounts[i-1];
1131:     }
1132: #if defined(PETSC_HAVE_MPI_IN_PLACE)
1133:     MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));
1134: #else
1135:     MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));
1136: #endif
1137:     /*--------------------------------------------------------------------
1138:         Assemble the matrix into usable form (note that numerical values are not yet set)
1139:     */
1140:     /* set the b->ilen (length of each row) values */
1141:     PetscArraycpy(b->ilen,lens,A->rmap->N);
1142:     /* set the b->i indices */
1143:     b->i[0] = 0;
1144:     for (i=1; i<=A->rmap->N; i++) {
1145:       b->i[i] = b->i[i-1] + lens[i-1];
1146:     }
1147:     PetscFree(lens);
1148:     MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);
1149:     MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);

1151:   } else {
1152:     B = **Bin;
1153:     b = (Mat_SeqAIJ*)B->data;
1154:   }

1156:   /*--------------------------------------------------------------------
1157:        Copy my part of matrix numerical values into the values location
1158:   */
1159:   if (flag == MAT_GET_VALUES) {
1160:     const PetscScalar *ada,*bda,*a_sendbuf,*b_sendbuf;
1161:     MatScalar         *sendbuf,*recvbuf;

1163:     MatSeqAIJGetArrayRead(a->A,&ada);
1164:     MatSeqAIJGetArrayRead(a->B,&bda);
1165:     sendcount = ad->nz + bd->nz;
1166:     sendbuf   = b->a + b->i[rstarts[rank]];
1167:     a_sendbuf = ada;
1168:     b_sendbuf = bda;
1169:     b_sendj   = bd->j;
1170:     n         = A->rmap->rend - A->rmap->rstart;
1171:     cnt       = 0;
1172:     for (i=0; i<n; i++) {
1173:       /* put in lower diagonal portion */
1174:       m = bd->i[i+1] - bd->i[i];
1175:       while (m > 0) {
1176:         /* is it above diagonal (in bd (compressed) numbering) */
1177:         if (garray[*b_sendj] > A->rmap->rstart + i) break;
1178:         sendbuf[cnt++] = *b_sendbuf++;
1179:         m--;
1180:         b_sendj++;
1181:       }

1183:       /* put in diagonal portion */
1184:       for (j=ad->i[i]; j<ad->i[i+1]; j++) {
1185:         sendbuf[cnt++] = *a_sendbuf++;
1186:       }

1188:       /* put in upper diagonal portion */
1189:       while (m-- > 0) {
1190:         sendbuf[cnt++] = *b_sendbuf++;
1191:         b_sendj++;
1192:       }
1193:     }
1194:     if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

1196:     /* -----------------------------------------------------------------
1197:        Gather all numerical values to all processors
1198:     */
1199:     if (!recvcounts) {
1200:       PetscMalloc2(size,&recvcounts,size,&displs);
1201:     }
1202:     for (i=0; i<size; i++) {
1203:       recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
1204:     }
1205:     displs[0] = 0;
1206:     for (i=1; i<size; i++) {
1207:       displs[i] = displs[i-1] + recvcounts[i-1];
1208:     }
1209:     recvbuf = b->a;
1210: #if defined(PETSC_HAVE_MPI_IN_PLACE)
1211:     MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));
1212: #else
1213:     MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));
1214: #endif
1215:     MatSeqAIJRestoreArrayRead(a->A,&ada);
1216:     MatSeqAIJRestoreArrayRead(a->B,&bda);
1217:   }  /* end of (flag == MAT_GET_VALUES) */
1218:   PetscFree2(recvcounts,displs);

1220:   if (A->symmetric) {
1221:     MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);
1222:   } else if (A->hermitian) {
1223:     MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);
1224:   } else if (A->structurally_symmetric) {
1225:     MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);
1226:   }
1227:   return(0);
1228: }
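/*
  Sketch of how the result of the routine above is returned (illustration only): Bin
  receives a freshly allocated array of length 2 whose first entry is the sequential
  matrix holding the entire parallel matrix, so a caller proceeds roughly as

     Mat *Bseq;
     MatCreateSubMatrix_MPIAIJ_All(A,MAT_GET_VALUES,MAT_INITIAL_MATRIX,&Bseq);
     // Bseq[0] now has A->rmap->N rows and A->cmap->N columns on every process
*/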

1230: PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool allcolumns,Mat *submats)
1231: {
1232:   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
1233:   Mat            submat,A = c->A,B = c->B;
1234:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data,*subc;
1235:   PetscInt       *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,nzA,nzB;
1236:   PetscInt       cstart = C->cmap->rstart,cend = C->cmap->rend,rstart = C->rmap->rstart,*bmap = c->garray;
1237:   const PetscInt *icol,*irow;
1238:   PetscInt       nrow,ncol,start;
1240:   PetscMPIInt    rank,size,tag1,tag2,tag3,tag4,*w1,*w2,nrqr;
1241:   PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,ct3,**rbuf1,row,proc;
1242:   PetscInt       nrqs=0,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol,*iptr;
1243:   PetscInt       **rbuf3,*req_source1,*req_source2,**sbuf_aj,**rbuf2,max1,nnz;
1244:   PetscInt       *lens,rmax,ncols,*cols,Crow;
1245: #if defined(PETSC_USE_CTABLE)
1246:   PetscTable     cmap,rmap;
1247:   PetscInt       *cmap_loc,*rmap_loc;
1248: #else
1249:   PetscInt       *cmap,*rmap;
1250: #endif
1251:   PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*sbuf1_i,*rbuf2_i,*rbuf3_i;
1252:   PetscInt       *cworkB,lwrite,*subcols,*row2proc;
1253:   PetscScalar    *vworkA,*vworkB,*a_a,*b_a,*subvals=NULL;
1254:   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
1255:   MPI_Request    *r_waits4,*s_waits3 = NULL,*s_waits4;
1256:   MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3 = NULL,*s_status2;
1257:   MPI_Status     *r_status3 = NULL,*r_status4,*s_status4;
1258:   MPI_Comm       comm;
1259:   PetscScalar    **rbuf4,**sbuf_aa,*vals,*sbuf_aa_i,*rbuf4_i;
1260:   PetscMPIInt    *onodes1,*olengths1,idex,end;
1261:   Mat_SubSppt    *smatis1;
1262:   PetscBool      isrowsorted,iscolsorted;

1267:   if (ismax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"This routine only works when all processes have ismax=1");
1268:   MatSeqAIJGetArrayRead(A,(const PetscScalar**)&a_a);
1269:   MatSeqAIJGetArrayRead(B,(const PetscScalar**)&b_a);
1270:   PetscObjectGetComm((PetscObject)C,&comm);
1271:   size = c->size;
1272:   rank = c->rank;

1274:   ISSorted(iscol[0],&iscolsorted);
1275:   ISSorted(isrow[0],&isrowsorted);
1276:   ISGetIndices(isrow[0],&irow);
1277:   ISGetLocalSize(isrow[0],&nrow);
1278:   if (allcolumns) {
1279:     icol = NULL;
1280:     ncol = C->cmap->N;
1281:   } else {
1282:     ISGetIndices(iscol[0],&icol);
1283:     ISGetLocalSize(iscol[0],&ncol);
1284:   }

1286:   if (scall == MAT_INITIAL_MATRIX) {
1287:     PetscInt *sbuf2_i,*cworkA,lwrite,ctmp;

1289:     /* Get some new tags to keep the communication clean */
1290:     tag1 = ((PetscObject)C)->tag;
1291:     PetscObjectGetNewTag((PetscObject)C,&tag2);
1292:     PetscObjectGetNewTag((PetscObject)C,&tag3);

1294:     /* evaluate communication - message to whom, length of message, and buffer space
1295:      required. Based on this, buffers are allocated and data copied into them */
1296:     PetscCalloc2(size,&w1,size,&w2);
1297:     PetscMalloc1(nrow,&row2proc);

1299:     /* w1[proc] = num of rows owned by proc -- to be requested */
1300:     proc = 0;
1301:     nrqs = 0; /* num of outgoing messages */
1302:     for (j=0; j<nrow; j++) {
1303:       row  = irow[j];
1304:       if (!isrowsorted) proc = 0;
1305:       while (row >= C->rmap->range[proc+1]) proc++;
1306:       w1[proc]++;
1307:       row2proc[j] = proc; /* map row index to proc */

1309:       if (proc != rank && !w2[proc]) {
1310:         w2[proc] = 1; nrqs++;
1311:       }
1312:     }
1313:     w1[rank] = 0;  /* rows owned by self will not be requested */

1315:     PetscMalloc1(nrqs,&pa); /*(proc -array)*/
1316:     for (proc=0,j=0; proc<size; proc++) {
1317:       if (w1[proc]) { pa[j++] = proc;}
1318:     }

1320:     /* Each message would have a header = 1 + 2*(num of IS) + data (here,num of IS = 1) */
1321:     msz = 0;              /* total mesg length (for all procs) */
1322:     for (i=0; i<nrqs; i++) {
1323:       proc      = pa[i];
1324:       w1[proc] += 3;
1325:       msz      += w1[proc];
1326:     }
1327:     PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);

1329:     /* Determine nrqr, the number of messages to expect, their lengths, and their from-ids */
1330:     /* if w2[proc]=1, a message of length w1[proc] will be sent to proc; */
1331:     PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);

1333:     /* Input: nrqs: nsend; nrqr: nrecv; w1: msg length to be sent;
1334:        Output: onodes1: recv node-ids; olengths1: corresponding recv message length */
1335:     PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);

1337:     /* Now post the Irecvs corresponding to these messages */
1338:     PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);

1340:     PetscFree(onodes1);
1341:     PetscFree(olengths1);

1343:     /* Allocate Memory for outgoing messages */
1344:     PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);
1345:     PetscArrayzero(sbuf1,size);
1346:     PetscArrayzero(ptr,size);

1348:     /* sbuf1[pa[0]] = tmp, sbuf1[pa[i]] = sbuf1[pa[i-1]] + w1[pa[i-1]] */
1349:     iptr = tmp;
1350:     for (i=0; i<nrqs; i++) {
1351:       proc        = pa[i];
1352:       sbuf1[proc] = iptr;
1353:       iptr       += w1[proc];
1354:     }

1356:     /* Form the outgoing messages */
1357:     /* Initialize the header space */
1358:     for (i=0; i<nrqs; i++) {
1359:       proc      = pa[i];
1360:       PetscArrayzero(sbuf1[proc],3);
1361:       ptr[proc] = sbuf1[proc] + 3;
1362:     }

1364:     /* Parse the isrow and copy data into outbuf */
1365:     PetscArrayzero(ctr,size);
1366:     for (j=0; j<nrow; j++) {  /* parse the indices of each IS */
1367:       proc = row2proc[j];
1368:       if (proc != rank) { /* copy to the outgoing buf*/
1369:         *ptr[proc] = irow[j];
1370:         ctr[proc]++; ptr[proc]++;
1371:       }
1372:     }

1374:     /* Update the headers for the current IS */
1375:     for (j=0; j<size; j++) { /* Can Optimise this loop too */
1376:       if ((ctr_j = ctr[j])) {
1377:         sbuf1_j        = sbuf1[j];
1378:         k              = ++sbuf1_j[0];
1379:         sbuf1_j[2*k]   = ctr_j;
1380:         sbuf1_j[2*k-1] = 0;
1381:       }
1382:     }

1384:     /* Now post the sends */
1385:     PetscMalloc1(nrqs,&s_waits1);
1386:     for (i=0; i<nrqs; ++i) {
1387:       proc = pa[i];
1388:       MPI_Isend(sbuf1[proc],w1[proc],MPIU_INT,proc,tag1,comm,s_waits1+i);
1389:     }

1391:     /* Post Receives to capture the buffer size */
1392:     PetscMalloc4(nrqs,&r_status2,nrqr,&s_waits2,nrqs,&r_waits2,nrqr,&s_status2);
1393:     PetscMalloc3(nrqs,&req_source2,nrqs,&rbuf2,nrqs,&rbuf3);

1395:     if (nrqs) rbuf2[0] = tmp + msz;
1396:     for (i=1; i<nrqs; ++i) rbuf2[i] = rbuf2[i-1] + w1[pa[i-1]];

1398:     for (i=0; i<nrqs; ++i) {
1399:       proc = pa[i];
1400:       MPI_Irecv(rbuf2[i],w1[proc],MPIU_INT,proc,tag2,comm,r_waits2+i);
1401:     }

1403:     PetscFree2(w1,w2);

1405:     /* Send to other procs the buf size they should allocate */
1406:     /* Receive messages*/
1407:     PetscMalloc1(nrqr,&r_status1);
1408:     PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);

1410:     MPI_Waitall(nrqr,r_waits1,r_status1);
1411:     for (i=0; i<nrqr; ++i) {
1412:       req_size[i] = 0;
1413:       rbuf1_i        = rbuf1[i];
1414:       start          = 2*rbuf1_i[0] + 1;
1415:       MPI_Get_count(r_status1+i,MPIU_INT,&end);
1416:       PetscMalloc1(end,&sbuf2[i]);
1417:       sbuf2_i        = sbuf2[i];
1418:       for (j=start; j<end; j++) {
1419:         k            = rbuf1_i[j] - rstart;
1420:         ncols        = ai[k+1] - ai[k] + bi[k+1] - bi[k];
1421:         sbuf2_i[j]   = ncols;
1422:         req_size[i] += ncols;
1423:       }
1424:       req_source1[i] = r_status1[i].MPI_SOURCE;

1426:       /* form the header */
1427:       sbuf2_i[0] = req_size[i];
1428:       for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];

1430:       MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);
1431:     }

1433:     PetscFree(r_status1);
1434:     PetscFree(r_waits1);

1436:     /* rbuf2 is received; post receives for the column indices a->j */
1437:     MPI_Waitall(nrqs,r_waits2,r_status2);

1439:     PetscMalloc4(nrqs,&r_waits3,nrqr,&s_waits3,nrqs,&r_status3,nrqr,&s_status3);
1440:     for (i=0; i<nrqs; ++i) {
1441:       PetscMalloc1(rbuf2[i][0],&rbuf3[i]);
1442:       req_source2[i] = r_status2[i].MPI_SOURCE;
1443:       MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);
1444:     }

1446:     /* Wait on sends1 and sends2 */
1447:     PetscMalloc1(nrqs,&s_status1);
1448:     MPI_Waitall(nrqs,s_waits1,s_status1);
1449:     PetscFree(s_waits1);
1450:     PetscFree(s_status1);

1452:     MPI_Waitall(nrqr,s_waits2,s_status2);
1453:     PetscFree4(r_status2,s_waits2,r_waits2,s_status2);

1455:     /* Now allocate sending buffers for a->j, and send them off */
1456:     PetscMalloc1(nrqr,&sbuf_aj);
1457:     for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1458:     if (nrqr) {PetscMalloc1(j,&sbuf_aj[0]);}
1459:     for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];

1461:     for (i=0; i<nrqr; i++) { /* for each requested message */
1462:       rbuf1_i   = rbuf1[i];
1463:       sbuf_aj_i = sbuf_aj[i];
1464:       ct1       = 2*rbuf1_i[0] + 1;
1465:       ct2       = 0;
1466:       /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 %d != 1",max1); */

1468:       kmax = rbuf1[i][2];
1469:       for (k=0; k<kmax; k++,ct1++) { /* for each row */
1470:         row    = rbuf1_i[ct1] - rstart;
1471:         nzA    = ai[row+1] - ai[row];
1472:         nzB    = bi[row+1] - bi[row];
1473:         ncols  = nzA + nzB;
1474:         cworkA = aj + ai[row]; cworkB = bj + bi[row];

1476:         /* load the column indices for this row into cols*/
1477:         cols = sbuf_aj_i + ct2;

1479:         lwrite = 0;
1480:         for (l=0; l<nzB; l++) {
1481:           if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
1482:         }
1483:         for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
1484:         for (l=0; l<nzB; l++) {
1485:           if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
1486:         }

1488:         ct2 += ncols;
1489:       }
1490:       MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);
1491:     }

1493:     /* create column map (cmap): global col of C -> local col of submat */
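    /* Example (without PETSC_USE_CTABLE): if the column IS selects global columns {3,7,9},
       then cmap[3]=1, cmap[7]=2, cmap[9]=3 and every other entry is 0, so a zero lookup
       means "column not in submat" and tcol-1 recovers the local column.
       With PETSC_USE_CTABLE, columns owned locally by C go into the array cmap_loc[]
       and only non-local columns go into the PetscTable, both with the same +1 shift. */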
1494: #if defined(PETSC_USE_CTABLE)
1495:     if (!allcolumns) {
1496:       PetscTableCreate(ncol,C->cmap->N,&cmap);
1497:       PetscCalloc1(C->cmap->n,&cmap_loc);
1498:       for (j=0; j<ncol; j++) { /* use array cmap_loc[] for local col indices */
1499:         if (icol[j] >= cstart && icol[j] <cend) {
1500:           cmap_loc[icol[j] - cstart] = j+1;
1501:         } else { /* use PetscTable for non-local col indices */
1502:           PetscTableAdd(cmap,icol[j]+1,j+1,INSERT_VALUES);
1503:         }
1504:       }
1505:     } else {
1506:       cmap     = NULL;
1507:       cmap_loc = NULL;
1508:     }
1509:     PetscCalloc1(C->rmap->n,&rmap_loc);
1510: #else
1511:     if (!allcolumns) {
1512:       PetscCalloc1(C->cmap->N,&cmap);
1513:       for (j=0; j<ncol; j++) cmap[icol[j]] = j+1;
1514:     } else {
1515:       cmap = NULL;
1516:     }
1517: #endif

1519:     /* Create lens for MatSeqAIJSetPreallocation() */
1520:     PetscCalloc1(nrow,&lens);

1522:     /* Compute lens from local part of C */
1523:     for (j=0; j<nrow; j++) {
1524:       row  = irow[j];
1525:       proc = row2proc[j];
1526:       if (proc == rank) {
1527:         /* diagonal part A = c->A */
1528:         ncols = ai[row-rstart+1] - ai[row-rstart];
1529:         cols  = aj + ai[row-rstart];
1530:         if (!allcolumns) {
1531:           for (k=0; k<ncols; k++) {
1532: #if defined(PETSC_USE_CTABLE)
1533:             tcol = cmap_loc[cols[k]];
1534: #else
1535:             tcol = cmap[cols[k]+cstart];
1536: #endif
1537:             if (tcol) lens[j]++;
1538:           }
1539:         } else { /* allcolumns */
1540:           lens[j] = ncols;
1541:         }

1543:         /* off-diagonal part B = c->B */
1544:         ncols = bi[row-rstart+1] - bi[row-rstart];
1545:         cols  = bj + bi[row-rstart];
1546:         if (!allcolumns) {
1547:           for (k=0; k<ncols; k++) {
1548: #if defined(PETSC_USE_CTABLE)
1549:             PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);
1550: #else
1551:             tcol = cmap[bmap[cols[k]]];
1552: #endif
1553:             if (tcol) lens[j]++;
1554:           }
1555:         } else { /* allcolumns */
1556:           lens[j] += ncols;
1557:         }
1558:       }
1559:     }

1561:     /* Create row map (rmap): global row of C -> local row of submat */
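    /* Example (without PETSC_USE_CTABLE): if the row IS selects global rows {5,8,11},
       then rmap[5]=0, rmap[8]=1, rmap[11]=2 (no +1 shift here, unlike cmap).
       With PETSC_USE_CTABLE, locally owned rows are stored as rmap_loc[row-rstart]=j
       and only non-local rows go into the PetscTable with a +1 shift, which is undone
       by the row-- after PetscTableFind(). */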
1562: #if defined(PETSC_USE_CTABLE)
1563:     PetscTableCreate(nrow,C->rmap->N,&rmap);
1564:     for (j=0; j<nrow; j++) {
1565:       row  = irow[j];
1566:       proc = row2proc[j];
1567:       if (proc == rank) { /* a local row */
1568:         rmap_loc[row - rstart] = j;
1569:       } else {
1570:         PetscTableAdd(rmap,irow[j]+1,j+1,INSERT_VALUES);
1571:       }
1572:     }
1573: #else
1574:     PetscCalloc1(C->rmap->N,&rmap);
1575:     for (j=0; j<nrow; j++) {
1576:       rmap[irow[j]] = j;
1577:     }
1578: #endif

1580:     /* Update lens from offproc data */
1581:     /* recv a->j is done */
1582:     MPI_Waitall(nrqs,r_waits3,r_status3);
1583:     for (i=0; i<nrqs; i++) {
1584:       proc    = pa[i];
1585:       sbuf1_i = sbuf1[proc];
1586:       /* jmax    = sbuf1_i[0]; if (jmax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax !=1"); */
1587:       ct1     = 2 + 1;
1588:       ct2     = 0;
1589:       rbuf2_i = rbuf2[i]; /* received length of C->j */
1590:       rbuf3_i = rbuf3[i]; /* received C->j */

1592:       /* is_no  = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1593:       max1   = sbuf1_i[2];
1594:       for (k=0; k<max1; k++,ct1++) {
1595: #if defined(PETSC_USE_CTABLE)
1596:         PetscTableFind(rmap,sbuf1_i[ct1]+1,&row);
1597:         row--;
1598:         if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
1599: #else
1600:         row = rmap[sbuf1_i[ct1]]; /* the row index in submat */
1601: #endif
1602:         /* Now, store row index of submat in sbuf1_i[ct1] */
1603:         sbuf1_i[ct1] = row;

1605:         nnz = rbuf2_i[ct1];
1606:         if (!allcolumns) {
1607:           for (l=0; l<nnz; l++,ct2++) {
1608: #if defined(PETSC_USE_CTABLE)
1609:             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) {
1610:               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1611:             } else {
1612:               PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);
1613:             }
1614: #else
1615:             tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */
1616: #endif
1617:             if (tcol) lens[row]++;
1618:           }
1619:         } else { /* allcolumns */
1620:           lens[row] += nnz;
1621:         }
1622:       }
1623:     }
1624:     MPI_Waitall(nrqr,s_waits3,s_status3);
1625:     PetscFree4(r_waits3,s_waits3,r_status3,s_status3);

1627:     /* Create the submatrices */
1628:     MatCreate(PETSC_COMM_SELF,&submat);
1629:     MatSetSizes(submat,nrow,ncol,PETSC_DETERMINE,PETSC_DETERMINE);

1631:     ISGetBlockSize(isrow[0],&i);
1632:     ISGetBlockSize(iscol[0],&j);
1633:     MatSetBlockSizes(submat,i,j);
1634:     MatSetType(submat,((PetscObject)A)->type_name);
1635:     MatSeqAIJSetPreallocation(submat,0,lens);

1637:     /* create struct Mat_SubSppt and attach it to submat */
1638:     PetscNew(&smatis1);
1639:     subc = (Mat_SeqAIJ*)submat->data;
1640:     subc->submatis1 = smatis1;

1642:     smatis1->id          = 0;
1643:     smatis1->nrqs        = nrqs;
1644:     smatis1->nrqr        = nrqr;
1645:     smatis1->rbuf1       = rbuf1;
1646:     smatis1->rbuf2       = rbuf2;
1647:     smatis1->rbuf3       = rbuf3;
1648:     smatis1->sbuf2       = sbuf2;
1649:     smatis1->req_source2 = req_source2;

1651:     smatis1->sbuf1       = sbuf1;
1652:     smatis1->ptr         = ptr;
1653:     smatis1->tmp         = tmp;
1654:     smatis1->ctr         = ctr;

1656:     smatis1->pa           = pa;
1657:     smatis1->req_size     = req_size;
1658:     smatis1->req_source1  = req_source1;

1660:     smatis1->allcolumns  = allcolumns;
1661:     smatis1->singleis    = PETSC_TRUE;
1662:     smatis1->row2proc    = row2proc;
1663:     smatis1->rmap        = rmap;
1664:     smatis1->cmap        = cmap;
1665: #if defined(PETSC_USE_CTABLE)
1666:     smatis1->rmap_loc    = rmap_loc;
1667:     smatis1->cmap_loc    = cmap_loc;
1668: #endif

1670:     smatis1->destroy     = submat->ops->destroy;
1671:     submat->ops->destroy = MatDestroySubMatrix_SeqAIJ;
1672:     submat->factortype   = C->factortype;

1674:     /* compute rmax */
1675:     rmax = 0;
1676:     for (i=0; i<nrow; i++) rmax = PetscMax(rmax,lens[i]);

1678:   } else { /* scall == MAT_REUSE_MATRIX */
1679:     submat = submats[0];
1680:     if (submat->rmap->n != nrow || submat->cmap->n != ncol) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix, wrong size");

1682:     subc    = (Mat_SeqAIJ*)submat->data;
1683:     rmax    = subc->rmax;
1684:     smatis1 = subc->submatis1;
1685:     nrqs        = smatis1->nrqs;
1686:     nrqr        = smatis1->nrqr;
1687:     rbuf1       = smatis1->rbuf1;
1688:     rbuf2       = smatis1->rbuf2;
1689:     rbuf3       = smatis1->rbuf3;
1690:     req_source2 = smatis1->req_source2;

1692:     sbuf1     = smatis1->sbuf1;
1693:     sbuf2     = smatis1->sbuf2;
1694:     ptr       = smatis1->ptr;
1695:     tmp       = smatis1->tmp;
1696:     ctr       = smatis1->ctr;

1698:     pa         = smatis1->pa;
1699:     req_size   = smatis1->req_size;
1700:     req_source1 = smatis1->req_source1;

1702:     allcolumns = smatis1->allcolumns;
1703:     row2proc   = smatis1->row2proc;
1704:     rmap       = smatis1->rmap;
1705:     cmap       = smatis1->cmap;
1706: #if defined(PETSC_USE_CTABLE)
1707:     rmap_loc   = smatis1->rmap_loc;
1708:     cmap_loc   = smatis1->cmap_loc;
1709: #endif
1710:   }

1712:   /* Post recv matrix values */
1713:   PetscMalloc3(nrqs,&rbuf4, rmax,&subcols, rmax,&subvals);
1714:   PetscMalloc4(nrqs,&r_waits4,nrqr,&s_waits4,nrqs,&r_status4,nrqr,&s_status4);
1715:   PetscObjectGetNewTag((PetscObject)C,&tag4);
1716:   for (i=0; i<nrqs; ++i) {
1717:     PetscMalloc1(rbuf2[i][0],&rbuf4[i]);
1718:     MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);
1719:   }

1721:   /* Allocate sending buffers for a->a, and send them off */
1722:   PetscMalloc1(nrqr,&sbuf_aa);
1723:   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1724:   if (nrqr) {PetscMalloc1(j,&sbuf_aa[0]);}
1725:   for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];

1727:   for (i=0; i<nrqr; i++) {
1728:     rbuf1_i   = rbuf1[i];
1729:     sbuf_aa_i = sbuf_aa[i];
1730:     ct1       = 2*rbuf1_i[0]+1;
1731:     ct2       = 0;
1732:     /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 !=1"); */

1734:     kmax = rbuf1_i[2];
1735:     for (k=0; k<kmax; k++,ct1++) {
1736:       row = rbuf1_i[ct1] - rstart;
1737:       nzA = ai[row+1] - ai[row];
1738:       nzB = bi[row+1] - bi[row];
1739:       ncols  = nzA + nzB;
1740:       cworkB = bj + bi[row];
1741:       vworkA = a_a + ai[row];
1742:       vworkB = b_a + bi[row];

1744:       /* load the column values for this row into vals */
1745:       vals = sbuf_aa_i + ct2;

1747:       lwrite = 0;
1748:       for (l=0; l<nzB; l++) {
1749:         if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
1750:       }
1751:       for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
1752:       for (l=0; l<nzB; l++) {
1753:         if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
1754:       }

1756:       ct2 += ncols;
1757:     }
1758:     MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);
1759:   }

1761:   /* Assemble submat */
1762:   /* First assemble the local rows */
1763:   for (j=0; j<nrow; j++) {
1764:     row  = irow[j];
1765:     proc = row2proc[j];
1766:     if (proc == rank) {
1767:       Crow = row - rstart;  /* local row index of C */
1768: #if defined(PETSC_USE_CTABLE)
1769:       row = rmap_loc[Crow]; /* row index of submat */
1770: #else
1771:       row = rmap[row];
1772: #endif

1774:       if (allcolumns) {
1775:         /* diagonal part A = c->A */
1776:         ncols = ai[Crow+1] - ai[Crow];
1777:         cols  = aj + ai[Crow];
1778:         vals  = a_a + ai[Crow];
1779:         i     = 0;
1780:         for (k=0; k<ncols; k++) {
1781:           subcols[i]   = cols[k] + cstart;
1782:           subvals[i++] = vals[k];
1783:         }

1785:         /* off-diagonal part B = c->B */
1786:         ncols = bi[Crow+1] - bi[Crow];
1787:         cols  = bj + bi[Crow];
1788:         vals  = b_a + bi[Crow];
1789:         for (k=0; k<ncols; k++) {
1790:           subcols[i]   = bmap[cols[k]];
1791:           subvals[i++] = vals[k];
1792:         }

1794:         MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);

1796:       } else { /* !allcolumns */
1797: #if defined(PETSC_USE_CTABLE)
1798:         /* diagonal part A = c->A */
1799:         ncols = ai[Crow+1] - ai[Crow];
1800:         cols  = aj + ai[Crow];
1801:         vals  = a_a + ai[Crow];
1802:         i     = 0;
1803:         for (k=0; k<ncols; k++) {
1804:           tcol = cmap_loc[cols[k]];
1805:           if (tcol) {
1806:             subcols[i]   = --tcol;
1807:             subvals[i++] = vals[k];
1808:           }
1809:         }

1811:         /* off-diagonal part B = c->B */
1812:         ncols = bi[Crow+1] - bi[Crow];
1813:         cols  = bj + bi[Crow];
1814:         vals  = b_a + bi[Crow];
1815:         for (k=0; k<ncols; k++) {
1816:           PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);
1817:           if (tcol) {
1818:             subcols[i]   = --tcol;
1819:             subvals[i++] = vals[k];
1820:           }
1821:         }
1822: #else
1823:         /* diagonal part A = c->A */
1824:         ncols = ai[Crow+1] - ai[Crow];
1825:         cols  = aj + ai[Crow];
1826:         vals  = a_a + ai[Crow];
1827:         i     = 0;
1828:         for (k=0; k<ncols; k++) {
1829:           tcol = cmap[cols[k]+cstart];
1830:           if (tcol) {
1831:             subcols[i]   = --tcol;
1832:             subvals[i++] = vals[k];
1833:           }
1834:         }

1836:         /* off-diagonal part B = c->B */
1837:         ncols = bi[Crow+1] - bi[Crow];
1838:         cols  = bj + bi[Crow];
1839:         vals  = b_a + bi[Crow];
1840:         for (k=0; k<ncols; k++) {
1841:           tcol = cmap[bmap[cols[k]]];
1842:           if (tcol) {
1843:             subcols[i]   = --tcol;
1844:             subvals[i++] = vals[k];
1845:           }
1846:         }
1847: #endif
1848:         MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);
1849:       }
1850:     }
1851:   }

1853:   /* Now assemble the off-proc rows */
1854:   for (i=0; i<nrqs; i++) { /* for each requested message */
1855:     /* recv values from other processes */
1856:     MPI_Waitany(nrqs,r_waits4,&idex,r_status4+i);
1857:     proc    = pa[idex];
1858:     sbuf1_i = sbuf1[proc];
1859:     /* jmax    = sbuf1_i[0]; if (jmax != 1)SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax %d != 1",jmax); */
1860:     ct1     = 2 + 1;
1861:     ct2     = 0; /* count of received C->j */
1862:     ct3     = 0; /* count of received C->j that will be inserted into submat */
1863:     rbuf2_i = rbuf2[idex]; /* received lengths of C->j from proc pa[idex] */
1864:     rbuf3_i = rbuf3[idex]; /* received C->j from proc pa[idex] */
1865:     rbuf4_i = rbuf4[idex]; /* received C->a from proc pa[idex] */

1867:     /* is_no = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1868:     max1 = sbuf1_i[2];             /* num of rows */
1869:     for (k=0; k<max1; k++,ct1++) { /* for each received row */
1870:       row = sbuf1_i[ct1]; /* row index of submat */
1871:       if (!allcolumns) {
1872:         idex = 0;
1873:         if (scall == MAT_INITIAL_MATRIX || !iscolsorted) {
1874:           nnz  = rbuf2_i[ct1]; /* num of C entries in this row */
1875:           for (l=0; l<nnz; l++,ct2++) { /* for each received column */
1876: #if defined(PETSC_USE_CTABLE)
1877:             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) {
1878:               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1879:             } else {
1880:               PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);
1881:             }
1882: #else
1883:             tcol = cmap[rbuf3_i[ct2]];
1884: #endif
1885:             if (tcol) {
1886:               subcols[idex]   = --tcol; /* may not be sorted */
1887:               subvals[idex++] = rbuf4_i[ct2];

1889:               /* We receive all column indices of a row of C, but only a subset of them is inserted into submat.
1890:                For reuse, we overwrite the received C->j entry with the buffer index of the entry to be inserted into submat */
1891:               if (iscolsorted) rbuf3_i[ct3++] = ct2;
1892:             }
1893:           }
1894:           MatSetValues_SeqAIJ(submat,1,&row,idex,subcols,subvals,INSERT_VALUES);
1895:         } else { /* scall == MAT_REUSE_MATRIX */
1896:           submat = submats[0];
1897:           subc   = (Mat_SeqAIJ*)submat->data;

1899:           nnz = subc->i[row+1] - subc->i[row]; /* num of submat entries in this row */
1900:           for (l=0; l<nnz; l++) {
1901:             ct2 = rbuf3_i[ct3++]; /* index into rbuf4_i[] of the value to be inserted into submat */
1902:             subvals[idex++] = rbuf4_i[ct2];
1903:           }

1905:           bj = subc->j + subc->i[row]; /* sorted column indices */
1906:           MatSetValues_SeqAIJ(submat,1,&row,nnz,bj,subvals,INSERT_VALUES);
1907:         }
1908:       } else { /* allcolumns */
1909:         nnz  = rbuf2_i[ct1]; /* num of C entries in this row */
1910:         MatSetValues_SeqAIJ(submat,1,&row,nnz,rbuf3_i+ct2,rbuf4_i+ct2,INSERT_VALUES);
1911:         ct2 += nnz;
1912:       }
1913:     }
1914:   }

1916:   /* sends of a->a are done */
1917:   MPI_Waitall(nrqr,s_waits4,s_status4);
1918:   PetscFree4(r_waits4,s_waits4,r_status4,s_status4);

1920:   MatAssemblyBegin(submat,MAT_FINAL_ASSEMBLY);
1921:   MatAssemblyEnd(submat,MAT_FINAL_ASSEMBLY);
1922:   submats[0] = submat;

1924:   /* Restore the indices */
1925:   ISRestoreIndices(isrow[0],&irow);
1926:   if (!allcolumns) {
1927:     ISRestoreIndices(iscol[0],&icol);
1928:   }

1930:   /* Destroy allocated memory */
1931:   for (i=0; i<nrqs; ++i) {
1932:     PetscFree(rbuf4[i]);
1933:   }
1934:   PetscFree3(rbuf4,subcols,subvals);
1935:   if (sbuf_aa) {
1936:     PetscFree(sbuf_aa[0]);
1937:     PetscFree(sbuf_aa);
1938:   }

1940:   if (scall == MAT_INITIAL_MATRIX) {
1941:     PetscFree(lens);
1942:     if (sbuf_aj) {
1943:       PetscFree(sbuf_aj[0]);
1944:       PetscFree(sbuf_aj);
1945:     }
1946:   }
1947:   MatSeqAIJRestoreArrayRead(A,(const PetscScalar**)&a_a);
1948:   MatSeqAIJRestoreArrayRead(B,(const PetscScalar**)&b_a);
1949:   return(0);
1950: }

1952: PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
1953: {
1955:   PetscInt       ncol;
1956:   PetscBool      colflag,allcolumns=PETSC_FALSE;

1959:   /* Allocate memory to hold all the submatrices */
1960:   if (scall == MAT_INITIAL_MATRIX) {
1961:     PetscCalloc1(2,submat);
1962:   }

1964:   /* Check for special case: each processor gets entire matrix columns */
1965:   ISIdentity(iscol[0],&colflag);
1966:   ISGetLocalSize(iscol[0],&ncol);
1967:   if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE;

1969:   MatCreateSubMatrices_MPIAIJ_SingleIS_Local(C,ismax,isrow,iscol,scall,allcolumns,*submat);
1970:   return(0);
1971: }

1973: PetscErrorCode MatCreateSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
1974: {
1976:   PetscInt       nmax,nstages=0,i,pos,max_no,nrow,ncol,in[2],out[2];
1977:   PetscBool      rowflag,colflag,wantallmatrix=PETSC_FALSE;
1978:   Mat_SeqAIJ     *subc;
1979:   Mat_SubSppt    *smat;

1982:   /* Check for special case: each processor has a single IS */
1983:   if (C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip MPI_Allreduce() */
1984:     MatCreateSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);
1985:     C->submat_singleis = PETSC_FALSE; /* restore the default value in case C is later used with non-single IS */
1986:     return(0);
1987:   }

1989:   /* Determine nmax: the maximum number of submatrices per stage, based on the matrix column dimension */
1990:   if (!C->cmap->N) nmax=20*1000000/sizeof(PetscInt);
1991:   else nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt));
1992:   if (!nmax) nmax = 1;

1994:   if (scall == MAT_INITIAL_MATRIX) {
1995:     /* Collect global wantallmatrix and nstages */
1996:     if (ismax == 1 && C->rmap->N == C->cmap->N) {
1997:       ISIdentity(*isrow,&rowflag);
1998:       ISIdentity(*iscol,&colflag);
1999:       ISGetLocalSize(*isrow,&nrow);
2000:       ISGetLocalSize(*iscol,&ncol);
2001:       if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
2002:         wantallmatrix = PETSC_TRUE;

2004:         PetscOptionsGetBool(((PetscObject)C)->options,((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL);
2005:       }
2006:     }

2008:     /* Determine the number of stages through which submatrices are done
2009:        Each stage will extract nmax submatrices.
2010:        nmax is determined by the matrix column dimension.
2011:        If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
2012:     */
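    /* For example, assuming 8-byte PetscInt and C->cmap->N = 1000000 columns,
       nmax = 20*1000000/(1000000*8) = 2, so ismax = 5 index sets would be processed
       in nstages = 3 stages of at most 2 submatrices each. */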
2013:     nstages = ismax/nmax + ((ismax % nmax) ? 1 : 0); /* local nstages */

2015:     in[0] = -1*(PetscInt)wantallmatrix;
2016:     in[1] = nstages;
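    /* MPI_MAX on the negated flag: out[0] is -1 only if every rank set wantallmatrix,
       so the negation below yields a global logical AND; out[1] becomes the global
       maximum number of stages. */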
2017:     MPIU_Allreduce(in,out,2,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));
2018:     wantallmatrix = (PetscBool)(-out[0]);
2019:     nstages       = out[1]; /* Make sure every processor loops through the global nstages */

2021:   } else { /* MAT_REUSE_MATRIX */
2022:     if (ismax) {
2023:       subc = (Mat_SeqAIJ*)(*submat)[0]->data;
2024:       smat = subc->submatis1;
2025:     } else { /* (*submat)[0] is a dummy matrix */
2026:       smat = (Mat_SubSppt*)(*submat)[0]->data;
2027:     }
2028:     if (!smat) {
2029:       /* smat is not generated by MatCreateSubMatrix_MPIAIJ_All(...,MAT_INITIAL_MATRIX,...) */
2030:       wantallmatrix = PETSC_TRUE;
2031:     } else if (smat->singleis) {
2032:       MatCreateSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);
2033:       return(0);
2034:     } else {
2035:       nstages = smat->nstages;
2036:     }
2037:   }

2039:   if (wantallmatrix) {
2040:     MatCreateSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);
2041:     return(0);
2042:   }

2044:   /* Allocate memory to hold all the submatrices and dummy submatrices */
2045:   if (scall == MAT_INITIAL_MATRIX) {
2046:     PetscCalloc1(ismax+nstages,submat);
2047:   }

2049:   for (i=0,pos=0; i<nstages; i++) {
2050:     if (pos+nmax <= ismax) max_no = nmax;
2051:     else if (pos >= ismax) max_no = 0;
2052:     else                   max_no = ismax-pos;

2054:     MatCreateSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,*submat+pos);
2055:     if (!max_no) {
2056:       if (scall == MAT_INITIAL_MATRIX) { /* submat[pos] is a dummy matrix */
2057:         smat = (Mat_SubSppt*)(*submat)[pos]->data;
2058:         smat->nstages = nstages;
2059:       }
2060:       pos++; /* advance to next dummy matrix if any */
2061:     } else pos += max_no;
2062:   }

2064:   if (ismax && scall == MAT_INITIAL_MATRIX) {
2065:     /* save nstages for reuse */
2066:     subc = (Mat_SeqAIJ*)(*submat)[0]->data;
2067:     smat = subc->submatis1;
2068:     smat->nstages = nstages;
2069:   }
2070:   return(0);
2071: }

2073: /* -------------------------------------------------------------------------*/
2074: PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats)
2075: {
2076:   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
2077:   Mat            A  = c->A;
2078:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*subc;
2079:   const PetscInt **icol,**irow;
2080:   PetscInt       *nrow,*ncol,start;
2082:   PetscMPIInt    rank,size,tag0,tag2,tag3,tag4,*w1,*w2,*w3,*w4,nrqr;
2083:   PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc=-1;
2084:   PetscInt       nrqs=0,msz,**ptr=NULL,*req_size=NULL,*ctr=NULL,*pa,*tmp=NULL,tcol;
2085:   PetscInt       **rbuf3=NULL,*req_source1=NULL,*req_source2,**sbuf_aj,**rbuf2=NULL,max1,max2;
2086:   PetscInt       **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
2087: #if defined(PETSC_USE_CTABLE)
2088:   PetscTable     *cmap,cmap_i=NULL,*rmap,rmap_i;
2089: #else
2090:   PetscInt       **cmap,*cmap_i=NULL,**rmap,*rmap_i;
2091: #endif
2092:   const PetscInt *irow_i;
2093:   PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
2094:   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
2095:   MPI_Request    *r_waits4,*s_waits3,*s_waits4;
2096:   MPI_Comm       comm;
2097:   PetscScalar    **rbuf4,*rbuf4_i,**sbuf_aa,*vals,*mat_a,*imat_a,*sbuf_aa_i;
2098:   PetscMPIInt    *onodes1,*olengths1,end;
2099:   PetscInt       **row2proc,*row2proc_i,ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
2100:   Mat_SubSppt    *smat_i;
2101:   PetscBool      *issorted,*allcolumns,colflag,iscsorted=PETSC_TRUE;
2102:   PetscInt       *sbuf1_i,*rbuf2_i,*rbuf3_i,ilen;

2105:   PetscObjectGetComm((PetscObject)C,&comm);
2106:   size = c->size;
2107:   rank = c->rank;

2109:   PetscMalloc4(ismax,&row2proc,ismax,&cmap,ismax,&rmap,ismax+1,&allcolumns);
2110:   PetscMalloc5(ismax,(PetscInt***)&irow,ismax,(PetscInt***)&icol,ismax,&nrow,ismax,&ncol,ismax,&issorted);

2112:   for (i=0; i<ismax; i++) {
2113:     ISSorted(iscol[i],&issorted[i]);
2114:     if (!issorted[i]) iscsorted = issorted[i];

2116:     ISSorted(isrow[i],&issorted[i]);

2118:     ISGetIndices(isrow[i],&irow[i]);
2119:     ISGetLocalSize(isrow[i],&nrow[i]);

2121:     /* Check for special case: allcolumn */
2122:     ISIdentity(iscol[i],&colflag);
2123:     ISGetLocalSize(iscol[i],&ncol[i]);
2124:     if (colflag && ncol[i] == C->cmap->N) {
2125:       allcolumns[i] = PETSC_TRUE;
2126:       icol[i] = NULL;
2127:     } else {
2128:       allcolumns[i] = PETSC_FALSE;
2129:       ISGetIndices(iscol[i],&icol[i]);
2130:     }
2131:   }

2133:   if (scall == MAT_REUSE_MATRIX) {
2134:     /* Assumes new rows are the same length as the old rows */
2135:     for (i=0; i<ismax; i++) {
2136:       if (!submats[i]) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats[%D] is null, cannot reuse",i);
2137:       subc = (Mat_SeqAIJ*)submats[i]->data;
2138:       if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix, wrong size");

2140:       /* Initialize the matrix as if it were empty */
2141:       PetscArrayzero(subc->ilen,submats[i]->rmap->n);

2143:       smat_i   = subc->submatis1;

2145:       nrqs        = smat_i->nrqs;
2146:       nrqr        = smat_i->nrqr;
2147:       rbuf1       = smat_i->rbuf1;
2148:       rbuf2       = smat_i->rbuf2;
2149:       rbuf3       = smat_i->rbuf3;
2150:       req_source2 = smat_i->req_source2;

2152:       sbuf1     = smat_i->sbuf1;
2153:       sbuf2     = smat_i->sbuf2;
2154:       ptr       = smat_i->ptr;
2155:       tmp       = smat_i->tmp;
2156:       ctr       = smat_i->ctr;

2158:       pa          = smat_i->pa;
2159:       req_size    = smat_i->req_size;
2160:       req_source1 = smat_i->req_source1;

2162:       allcolumns[i] = smat_i->allcolumns;
2163:       row2proc[i]   = smat_i->row2proc;
2164:       rmap[i]       = smat_i->rmap;
2165:       cmap[i]       = smat_i->cmap;
2166:     }

2168:     if (!ismax) { /* Get dummy submatrices and retrieve struct submatis1 */
2169:       if (!submats[0]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats are null, cannot reuse");
2170:       smat_i = (Mat_SubSppt*)submats[0]->data;

2172:       nrqs        = smat_i->nrqs;
2173:       nrqr        = smat_i->nrqr;
2174:       rbuf1       = smat_i->rbuf1;
2175:       rbuf2       = smat_i->rbuf2;
2176:       rbuf3       = smat_i->rbuf3;
2177:       req_source2 = smat_i->req_source2;

2179:       sbuf1       = smat_i->sbuf1;
2180:       sbuf2       = smat_i->sbuf2;
2181:       ptr         = smat_i->ptr;
2182:       tmp         = smat_i->tmp;
2183:       ctr         = smat_i->ctr;

2185:       pa          = smat_i->pa;
2186:       req_size    = smat_i->req_size;
2187:       req_source1 = smat_i->req_source1;

2189:       allcolumns[0] = PETSC_FALSE;
2190:     }
2191:   } else { /* scall == MAT_INITIAL_MATRIX */
2192:     /* Get some new tags to keep the communication clean */
2193:     PetscObjectGetNewTag((PetscObject)C,&tag2);
2194:     PetscObjectGetNewTag((PetscObject)C,&tag3);

2196:     /* Evaluate communication: message to whom, message length, and buffer space
2197:      required. Based on this, buffers are allocated and data is copied into them */
2198:     PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4);   /* mesg size, initialize work vectors */
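    /* Meaning of the work vectors: w1[p] = number of rows requested from proc p (later
       increased by the header size), w2[p] = 1 if a message is sent to p, w3[p] = number
       of index sets that need rows from p, w4[] = per-IS scratch counter reset after
       each index set. */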

2200:     for (i=0; i<ismax; i++) {
2201:       jmax   = nrow[i];
2202:       irow_i = irow[i];

2204:       PetscMalloc1(jmax,&row2proc_i);
2205:       row2proc[i] = row2proc_i;

2207:       if (issorted[i]) proc = 0;
2208:       for (j=0; j<jmax; j++) {
2209:         if (!issorted[i]) proc = 0;
2210:         row = irow_i[j];
2211:         while (row >= C->rmap->range[proc+1]) proc++;
2212:         w4[proc]++;
2213:         row2proc_i[j] = proc; /* map row index to proc */
2214:       }
2215:       for (j=0; j<size; j++) {
2216:         if (w4[j]) { w1[j] += w4[j];  w3[j]++; w4[j] = 0;}
2217:       }
2218:     }

2220:     nrqs     = 0;              /* no of outgoing messages */
2221:     msz      = 0;              /* total mesg length (for all procs) */
2222:     w1[rank] = 0;              /* no mesg sent to self */
2223:     w3[rank] = 0;
2224:     for (i=0; i<size; i++) {
2225:       if (w1[i])  { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
2226:     }
2227:     PetscMalloc1(nrqs,&pa); /* (proc array) */
2228:     for (i=0,j=0; i<size; i++) {
2229:       if (w1[i]) { pa[j] = i; j++; }
2230:     }

2232:     /* Each message would have a header = 1 + 2*(no of IS) + data */
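    /* Concretely, sbuf1[j][0] holds the number of index sets requesting rows from proc j;
       for the k-th such set, sbuf1[j][2k-1] is the IS number and sbuf1[j][2k] is the number
       of requested rows; the requested row indices follow, starting at sbuf1[j]+2*w3[j]+1. */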
2233:     for (i=0; i<nrqs; i++) {
2234:       j      = pa[i];
2235:       w1[j] += w2[j] + 2* w3[j];
2236:       msz   += w1[j];
2237:     }
2238:     PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);

2240:     /* Determine the number of messages to expect, their lengths, and their from-ids */
2241:     PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);
2242:     PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);

2244:     /* Now post the Irecvs corresponding to these messages */
2245:     PetscObjectGetNewTag((PetscObject)C,&tag0);
2246:     PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);

2248:     /* Allocate Memory for outgoing messages */
2249:     PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);
2250:     PetscArrayzero(sbuf1,size);
2251:     PetscArrayzero(ptr,size);

2253:     {
2254:       PetscInt *iptr = tmp;
2255:       k    = 0;
2256:       for (i=0; i<nrqs; i++) {
2257:         j        = pa[i];
2258:         iptr    += k;
2259:         sbuf1[j] = iptr;
2260:         k        = w1[j];
2261:       }
2262:     }

2264:     /* Form the outgoing messages. Initialize the header space */
2265:     for (i=0; i<nrqs; i++) {
2266:       j           = pa[i];
2267:       sbuf1[j][0] = 0;
2268:       PetscArrayzero(sbuf1[j]+1,2*w3[j]);
2269:       ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
2270:     }

2272:     /* Parse the isrow and copy data into outbuf */
2273:     for (i=0; i<ismax; i++) {
2274:       row2proc_i = row2proc[i];
2275:       PetscArrayzero(ctr,size);
2276:       irow_i = irow[i];
2277:       jmax   = nrow[i];
2278:       for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
2279:         proc = row2proc_i[j];
2280:         if (proc != rank) { /* copy to the outgoing buf */
2281:           ctr[proc]++;
2282:           *ptr[proc] = irow_i[j];
2283:           ptr[proc]++;
2284:         }
2285:       }
2286:       /* Update the headers for the current IS */
2287:       for (j=0; j<size; j++) { /* This loop could also be optimized */
2288:         if ((ctr_j = ctr[j])) {
2289:           sbuf1_j        = sbuf1[j];
2290:           k              = ++sbuf1_j[0];
2291:           sbuf1_j[2*k]   = ctr_j;
2292:           sbuf1_j[2*k-1] = i;
2293:         }
2294:       }
2295:     }

2297:     /* Now post the sends */
2298:     PetscMalloc1(nrqs,&s_waits1);
2299:     for (i=0; i<nrqs; ++i) {
2300:       j    = pa[i];
2301:       MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);
2302:     }

2304:     /* Post Receives to capture the buffer size */
2305:     PetscMalloc1(nrqs,&r_waits2);
2306:     PetscMalloc3(nrqs,&req_source2,nrqs,&rbuf2,nrqs,&rbuf3);
2307:     if (nrqs) rbuf2[0] = tmp + msz;
2308:     for (i=1; i<nrqs; ++i) {
2309:       rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
2310:     }
2311:     for (i=0; i<nrqs; ++i) {
2312:       j    = pa[i];
2313:       MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag2,comm,r_waits2+i);
2314:     }

2316:     /* Send to other procs the buf size they should allocate */
2317:     /* Receive messages */
2318:     PetscMalloc1(nrqr,&s_waits2);
2319:     PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);
2320:     {
2321:       PetscInt   *sAi = a->i,*sBi = b->i,id,rstart = C->rmap->rstart;
2322:       PetscInt   *sbuf2_i;

2324:       MPI_Waitall(nrqr,r_waits1,MPI_STATUSES_IGNORE);
2325:       for (i=0; i<nrqr; ++i) {
2326:         req_size[i] = 0;
2327:         rbuf1_i        = rbuf1[i];
2328:         start          = 2*rbuf1_i[0] + 1;
2329:         end            = olengths1[i];
2330:         PetscMalloc1(end,&sbuf2[i]);
2331:         sbuf2_i        = sbuf2[i];
2332:         for (j=start; j<end; j++) {
2333:           id              = rbuf1_i[j] - rstart;
2334:           ncols           = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
2335:           sbuf2_i[j]      = ncols;
2336:           req_size[i] += ncols;
2337:         }
2338:         req_source1[i] = onodes1[i];
2339:         /* form the header */
2340:         sbuf2_i[0] = req_size[i];
2341:         for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];

2343:         MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);
2344:       }
2345:     }

2347:     PetscFree(onodes1);
2348:     PetscFree(olengths1);
2349:     PetscFree(r_waits1);
2350:     PetscFree4(w1,w2,w3,w4);

2352:     /* Receive messages */
2353:     PetscMalloc1(nrqs,&r_waits3);
2354:     MPI_Waitall(nrqs,r_waits2,MPI_STATUSES_IGNORE);
2355:     for (i=0; i<nrqs; ++i) {
2356:       PetscMalloc1(rbuf2[i][0],&rbuf3[i]);
2357:       req_source2[i] = pa[i];
2358:       MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);
2359:     }
2360:     PetscFree(r_waits2);

2362:     /* Wait on sends1 and sends2 */
2363:     MPI_Waitall(nrqs,s_waits1,MPI_STATUSES_IGNORE);
2364:     MPI_Waitall(nrqr,s_waits2,MPI_STATUSES_IGNORE);
2365:     PetscFree(s_waits1);
2366:     PetscFree(s_waits2);

2368:     /* Now allocate sending buffers for a->j, and send them off */
2369:     PetscMalloc1(nrqr,&sbuf_aj);
2370:     for (i=0,j=0; i<nrqr; i++) j += req_size[i];
2371:     if (nrqr) {PetscMalloc1(j,&sbuf_aj[0]);}
2372:     for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];

2374:     PetscMalloc1(nrqr,&s_waits3);
2375:     {
2376:       PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite;
2377:       PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
2378:       PetscInt cend = C->cmap->rend;
2379:       PetscInt *a_j = a->j,*b_j = b->j,ctmp;

2381:       for (i=0; i<nrqr; i++) {
2382:         rbuf1_i   = rbuf1[i];
2383:         sbuf_aj_i = sbuf_aj[i];
2384:         ct1       = 2*rbuf1_i[0] + 1;
2385:         ct2       = 0;
2386:         for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
2387:           kmax = rbuf1[i][2*j];
2388:           for (k=0; k<kmax; k++,ct1++) {
2389:             row    = rbuf1_i[ct1] - rstart;
2390:             nzA    = a_i[row+1] - a_i[row];
2391:             nzB    = b_i[row+1] - b_i[row];
2392:             ncols  = nzA + nzB;
2393:             cworkA = a_j + a_i[row];
2394:             cworkB = b_j + b_i[row];

2396:             /* load the column indices for this row into cols */
2397:             cols = sbuf_aj_i + ct2;

2399:             lwrite = 0;
2400:             for (l=0; l<nzB; l++) {
2401:               if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
2402:             }
2403:             for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
2404:             for (l=0; l<nzB; l++) {
2405:               if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
2406:             }

2408:             ct2 += ncols;
2409:           }
2410:         }
2411:         MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);
2412:       }
2413:     }

2415:     /* create col map: global col of C -> local col of submatrices */
2416:     {
2417:       const PetscInt *icol_i;
2418: #if defined(PETSC_USE_CTABLE)
2419:       for (i=0; i<ismax; i++) {
2420:         if (!allcolumns[i]) {
2421:           PetscTableCreate(ncol[i],C->cmap->N,&cmap[i]);

2423:           jmax   = ncol[i];
2424:           icol_i = icol[i];
2425:           cmap_i = cmap[i];
2426:           for (j=0; j<jmax; j++) {
2427:             PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);
2428:           }
2429:         } else cmap[i] = NULL;
2430:       }
2431: #else
2432:       for (i=0; i<ismax; i++) {
2433:         if (!allcolumns[i]) {
2434:           PetscCalloc1(C->cmap->N,&cmap[i]);
2435:           jmax   = ncol[i];
2436:           icol_i = icol[i];
2437:           cmap_i = cmap[i];
2438:           for (j=0; j<jmax; j++) {
2439:             cmap_i[icol_i[j]] = j+1;
2440:           }
2441:         } else cmap[i] = NULL;
2442:       }
2443: #endif
2444:     }

2446:     /* Create lens, which is required for MatSeqAIJSetPreallocation() */
2447:     for (i=0,j=0; i<ismax; i++) j += nrow[i];
2448:     PetscMalloc1(ismax,&lens);

2450:     if (ismax) {
2451:       PetscCalloc1(j,&lens[0]);
2452:     }
2453:     for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1];

2455:     /* Update lens from local data */
2456:     for (i=0; i<ismax; i++) {
2457:       row2proc_i = row2proc[i];
2458:       jmax = nrow[i];
2459:       if (!allcolumns[i]) cmap_i = cmap[i];
2460:       irow_i = irow[i];
2461:       lens_i = lens[i];
2462:       for (j=0; j<jmax; j++) {
2463:         row = irow_i[j];
2464:         proc = row2proc_i[j];
2465:         if (proc == rank) {
2466:           MatGetRow_MPIAIJ(C,row,&ncols,&cols,NULL);
2467:           if (!allcolumns[i]) {
2468:             for (k=0; k<ncols; k++) {
2469: #if defined(PETSC_USE_CTABLE)
2470:               PetscTableFind(cmap_i,cols[k]+1,&tcol);
2471: #else
2472:               tcol = cmap_i[cols[k]];
2473: #endif
2474:               if (tcol) lens_i[j]++;
2475:             }
2476:           } else { /* allcolumns */
2477:             lens_i[j] = ncols;
2478:           }
2479:           MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,NULL);
2480:         }
2481:       }
2482:     }

2484:     /* Create row map: global row of C -> local row of submatrices */
2485: #if defined(PETSC_USE_CTABLE)
2486:     for (i=0; i<ismax; i++) {
2487:       PetscTableCreate(nrow[i],C->rmap->N,&rmap[i]);
2488:       irow_i = irow[i];
2489:       jmax   = nrow[i];
2490:       for (j=0; j<jmax; j++) {
2491:         PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);
2492:       }
2493:     }
2494: #else
2495:     for (i=0; i<ismax; i++) {
2496:       PetscCalloc1(C->rmap->N,&rmap[i]);
2497:       rmap_i = rmap[i];
2498:       irow_i = irow[i];
2499:       jmax   = nrow[i];
2500:       for (j=0; j<jmax; j++) {
2501:         rmap_i[irow_i[j]] = j;
2502:       }
2503:     }
2504: #endif

2506:     /* Update lens from offproc data */
2507:     {
2508:       PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;

2510:       MPI_Waitall(nrqs,r_waits3,MPI_STATUSES_IGNORE);
2511:       for (tmp2=0; tmp2<nrqs; tmp2++) {
2512:         sbuf1_i = sbuf1[pa[tmp2]];
2513:         jmax    = sbuf1_i[0];
2514:         ct1     = 2*jmax+1;
2515:         ct2     = 0;
2516:         rbuf2_i = rbuf2[tmp2];
2517:         rbuf3_i = rbuf3[tmp2];
2518:         for (j=1; j<=jmax; j++) {
2519:           is_no  = sbuf1_i[2*j-1];
2520:           max1   = sbuf1_i[2*j];
2521:           lens_i = lens[is_no];
2522:           if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2523:           rmap_i = rmap[is_no];
2524:           for (k=0; k<max1; k++,ct1++) {
2525: #if defined(PETSC_USE_CTABLE)
2526:             PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);
2527:             row--;
2528:             if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
2529: #else
2530:             row = rmap_i[sbuf1_i[ct1]]; /* the row index in the new submatrix */
2531: #endif
2532:             max2 = rbuf2_i[ct1];
2533:             for (l=0; l<max2; l++,ct2++) {
2534:               if (!allcolumns[is_no]) {
2535: #if defined(PETSC_USE_CTABLE)
2536:                 PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);
2537: #else
2538:                 tcol = cmap_i[rbuf3_i[ct2]];
2539: #endif
2540:                 if (tcol) lens_i[row]++;
2541:               } else { /* allcolumns */
2542:                 lens_i[row]++; /* lens_i[row] += max2 ? */
2543:               }
2544:             }
2545:           }
2546:         }
2547:       }
2548:     }
2549:     PetscFree(r_waits3);
2550:     MPI_Waitall(nrqr,s_waits3,MPI_STATUSES_IGNORE);
2551:     PetscFree(s_waits3);

2553:     /* Create the submatrices */
2554:     for (i=0; i<ismax; i++) {
2555:       PetscInt    rbs,cbs;

2557:       ISGetBlockSize(isrow[i],&rbs);
2558:       ISGetBlockSize(iscol[i],&cbs);

2560:       MatCreate(PETSC_COMM_SELF,submats+i);
2561:       MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);

2563:       MatSetBlockSizes(submats[i],rbs,cbs);
2564:       MatSetType(submats[i],((PetscObject)A)->type_name);
2565:       MatSeqAIJSetPreallocation(submats[i],0,lens[i]);

2567:       /* create struct Mat_SubSppt and attach it to submat */
2568:       PetscNew(&smat_i);
2569:       subc = (Mat_SeqAIJ*)submats[i]->data;
2570:       subc->submatis1 = smat_i;

2572:       smat_i->destroy          = submats[i]->ops->destroy;
2573:       submats[i]->ops->destroy = MatDestroySubMatrix_SeqAIJ;
2574:       submats[i]->factortype   = C->factortype;

2576:       smat_i->id          = i;
2577:       smat_i->nrqs        = nrqs;
2578:       smat_i->nrqr        = nrqr;
2579:       smat_i->rbuf1       = rbuf1;
2580:       smat_i->rbuf2       = rbuf2;
2581:       smat_i->rbuf3       = rbuf3;
2582:       smat_i->sbuf2       = sbuf2;
2583:       smat_i->req_source2 = req_source2;

2585:       smat_i->sbuf1       = sbuf1;
2586:       smat_i->ptr         = ptr;
2587:       smat_i->tmp         = tmp;
2588:       smat_i->ctr         = ctr;

2590:       smat_i->pa           = pa;
2591:       smat_i->req_size     = req_size;
2592:       smat_i->req_source1  = req_source1;

2594:       smat_i->allcolumns  = allcolumns[i];
2595:       smat_i->singleis    = PETSC_FALSE;
2596:       smat_i->row2proc    = row2proc[i];
2597:       smat_i->rmap        = rmap[i];
2598:       smat_i->cmap        = cmap[i];
2599:     }

2601:     if (!ismax) { /* Create dummy submats[0] so that struct Mat_SubSppt can be reused */
2602:       MatCreate(PETSC_COMM_SELF,&submats[0]);
2603:       MatSetSizes(submats[0],0,0,PETSC_DETERMINE,PETSC_DETERMINE);
2604:       MatSetType(submats[0],MATDUMMY);

2606:       /* create struct Mat_SubSppt and attach it to submat */
2607:       PetscNewLog(submats[0],&smat_i);
2608:       submats[0]->data = (void*)smat_i;

2610:       smat_i->destroy          = submats[0]->ops->destroy;
2611:       submats[0]->ops->destroy = MatDestroySubMatrix_Dummy;
2612:       submats[0]->factortype   = C->factortype;

2614:       smat_i->id          = 0;
2615:       smat_i->nrqs        = nrqs;
2616:       smat_i->nrqr        = nrqr;
2617:       smat_i->rbuf1       = rbuf1;
2618:       smat_i->rbuf2       = rbuf2;
2619:       smat_i->rbuf3       = rbuf3;
2620:       smat_i->sbuf2       = sbuf2;
2621:       smat_i->req_source2 = req_source2;

2623:       smat_i->sbuf1       = sbuf1;
2624:       smat_i->ptr         = ptr;
2625:       smat_i->tmp         = tmp;
2626:       smat_i->ctr         = ctr;

2628:       smat_i->pa           = pa;
2629:       smat_i->req_size     = req_size;
2630:       smat_i->req_source1  = req_source1;

2632:       smat_i->allcolumns  = PETSC_FALSE;
2633:       smat_i->singleis    = PETSC_FALSE;
2634:       smat_i->row2proc    = NULL;
2635:       smat_i->rmap        = NULL;
2636:       smat_i->cmap        = NULL;
2637:     }

2639:     if (ismax) {PetscFree(lens[0]);}
2640:     PetscFree(lens);
2641:     if (sbuf_aj) {
2642:       PetscFree(sbuf_aj[0]);
2643:       PetscFree(sbuf_aj);
2644:     }

2646:   } /* end of scall == MAT_INITIAL_MATRIX */

2648:   /* Post recv matrix values */
2649:   PetscObjectGetNewTag((PetscObject)C,&tag4);
2650:   PetscMalloc1(nrqs,&rbuf4);
2651:   PetscMalloc1(nrqs,&r_waits4);
2652:   for (i=0; i<nrqs; ++i) {
2653:     PetscMalloc1(rbuf2[i][0],&rbuf4[i]);
2654:     MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);
2655:   }

2657:   /* Allocate sending buffers for a->a, and send them off */
2658:   PetscMalloc1(nrqr,&sbuf_aa);
2659:   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
2660:   if (nrqr) {PetscMalloc1(j,&sbuf_aa[0]);}
2661:   for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];

2663:   PetscMalloc1(nrqr,&s_waits4);
2664:   {
2665:     PetscInt    nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite;
2666:     PetscInt    cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
2667:     PetscInt    cend   = C->cmap->rend;
2668:     PetscInt    *b_j   = b->j;
2669:     PetscScalar *vworkA,*vworkB,*a_a,*b_a;

2671:     MatSeqAIJGetArrayRead(A,(const PetscScalar**)&a_a);
2672:     MatSeqAIJGetArrayRead(c->B,(const PetscScalar**)&b_a);
2673:     for (i=0; i<nrqr; i++) {
2674:       rbuf1_i   = rbuf1[i];
2675:       sbuf_aa_i = sbuf_aa[i];
2676:       ct1       = 2*rbuf1_i[0]+1;
2677:       ct2       = 0;
2678:       for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
2679:         kmax = rbuf1_i[2*j];
2680:         for (k=0; k<kmax; k++,ct1++) {
2681:           row    = rbuf1_i[ct1] - rstart;
2682:           nzA    = a_i[row+1] - a_i[row];
2683:           nzB    = b_i[row+1] - b_i[row];
2684:           ncols  = nzA + nzB;
2685:           cworkB = b_j + b_i[row];
2686:           vworkA = a_a + a_i[row];
2687:           vworkB = b_a + b_i[row];

2689:           /* load the column values for this row into vals */
2690:           vals = sbuf_aa_i+ct2;

2692:           lwrite = 0;
2693:           for (l=0; l<nzB; l++) {
2694:             if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
2695:           }
2696:           for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
2697:           for (l=0; l<nzB; l++) {
2698:             if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
2699:           }

2701:           ct2 += ncols;
2702:         }
2703:       }
2704:       MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);
2705:     }
2706:     MatSeqAIJRestoreArrayRead(A,(const PetscScalar**)&a_a);
2707:     MatSeqAIJRestoreArrayRead(c->B,(const PetscScalar**)&b_a);
2708:   }

2710:   /* Assemble the matrices */
2711:   /* First assemble the local rows */
2712:   for (i=0; i<ismax; i++) {
2713:     row2proc_i = row2proc[i];
2714:     subc      = (Mat_SeqAIJ*)submats[i]->data;
2715:     imat_ilen = subc->ilen;
2716:     imat_j    = subc->j;
2717:     imat_i    = subc->i;
2718:     imat_a    = subc->a;

2720:     if (!allcolumns[i]) cmap_i = cmap[i];
2721:     rmap_i = rmap[i];
2722:     irow_i = irow[i];
2723:     jmax   = nrow[i];
2724:     for (j=0; j<jmax; j++) {
2725:       row  = irow_i[j];
2726:       proc = row2proc_i[j];
2727:       if (proc == rank) {
2728:         old_row = row;
2729: #if defined(PETSC_USE_CTABLE)
2730:         PetscTableFind(rmap_i,row+1,&row);
2731:         row--;
2732: #else
2733:         row = rmap_i[row];
2734: #endif
2735:         ilen_row = imat_ilen[row];
2736:         MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);
2737:         mat_i    = imat_i[row];
2738:         mat_a    = imat_a + mat_i;
2739:         mat_j    = imat_j + mat_i;
2740:         if (!allcolumns[i]) {
2741:           for (k=0; k<ncols; k++) {
2742: #if defined(PETSC_USE_CTABLE)
2743:             PetscTableFind(cmap_i,cols[k]+1,&tcol);
2744: #else
2745:             tcol = cmap_i[cols[k]];
2746: #endif
2747:             if (tcol) {
2748:               *mat_j++ = tcol - 1;
2749:               *mat_a++ = vals[k];
2750:               ilen_row++;
2751:             }
2752:           }
2753:         } else { /* allcolumns */
2754:           for (k=0; k<ncols; k++) {
2755:             *mat_j++ = cols[k];  /* global col index! */
2756:             *mat_a++ = vals[k];
2757:             ilen_row++;
2758:           }
2759:         }
2760:         MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);

2762:         imat_ilen[row] = ilen_row;
2763:       }
2764:     }
2765:   }

2767:   /* Now assemble the off-proc rows */
2768:   MPI_Waitall(nrqs,r_waits4,MPI_STATUSES_IGNORE);
2769:   for (tmp2=0; tmp2<nrqs; tmp2++) {
2770:     sbuf1_i = sbuf1[pa[tmp2]];
2771:     jmax    = sbuf1_i[0];
2772:     ct1     = 2*jmax + 1;
2773:     ct2     = 0;
2774:     rbuf2_i = rbuf2[tmp2];
2775:     rbuf3_i = rbuf3[tmp2];
2776:     rbuf4_i = rbuf4[tmp2];
2777:     for (j=1; j<=jmax; j++) {
2778:       is_no     = sbuf1_i[2*j-1];
2779:       rmap_i    = rmap[is_no];
2780:       if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2781:       subc      = (Mat_SeqAIJ*)submats[is_no]->data;
2782:       imat_ilen = subc->ilen;
2783:       imat_j    = subc->j;
2784:       imat_i    = subc->i;
2785:       imat_a    = subc->a;
2786:       max1      = sbuf1_i[2*j];
2787:       for (k=0; k<max1; k++,ct1++) {
2788:         row = sbuf1_i[ct1];
2789: #if defined(PETSC_USE_CTABLE)
2790:         PetscTableFind(rmap_i,row+1,&row);
2791:         row--;
2792: #else
2793:         row = rmap_i[row];
2794: #endif
2795:         ilen  = imat_ilen[row];
2796:         mat_i = imat_i[row];
2797:         mat_a = imat_a + mat_i;
2798:         mat_j = imat_j + mat_i;
2799:         max2  = rbuf2_i[ct1];
2800:         if (!allcolumns[is_no]) {
2801:           for (l=0; l<max2; l++,ct2++) {
2802: #if defined(PETSC_USE_CTABLE)
2803:             PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);
2804: #else
2805:             tcol = cmap_i[rbuf3_i[ct2]];
2806: #endif
2807:             if (tcol) {
2808:               *mat_j++ = tcol - 1;
2809:               *mat_a++ = rbuf4_i[ct2];
2810:               ilen++;
2811:             }
2812:           }
2813:         } else { /* allcolumns */
2814:           for (l=0; l<max2; l++,ct2++) {
2815:             *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */
2816:             *mat_a++ = rbuf4_i[ct2];
2817:             ilen++;
2818:           }
2819:         }
2820:         imat_ilen[row] = ilen;
2821:       }
2822:     }
2823:   }

2825:   if (!iscsorted) { /* sort column indices of the rows */
2826:     for (i=0; i<ismax; i++) {
2827:       subc      = (Mat_SeqAIJ*)submats[i]->data;
2828:       imat_j    = subc->j;
2829:       imat_i    = subc->i;
2830:       imat_a    = subc->a;
2831:       imat_ilen = subc->ilen;

2833:       if (allcolumns[i]) continue;
2834:       jmax = nrow[i];
2835:       for (j=0; j<jmax; j++) {
2836:         mat_i = imat_i[j];
2837:         mat_a = imat_a + mat_i;
2838:         mat_j = imat_j + mat_i;
2839:         PetscSortIntWithScalarArray(imat_ilen[j],mat_j,mat_a);
2840:       }
2841:     }
2842:   }

2844:   PetscFree(r_waits4);
2845:   MPI_Waitall(nrqr,s_waits4,MPI_STATUSES_IGNORE);
2846:   PetscFree(s_waits4);

2848:   /* Restore the indices */
2849:   for (i=0; i<ismax; i++) {
2850:     ISRestoreIndices(isrow[i],irow+i);
2851:     if (!allcolumns[i]) {
2852:       ISRestoreIndices(iscol[i],icol+i);
2853:     }
2854:   }

2856:   for (i=0; i<ismax; i++) {
2857:     MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);
2858:     MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);
2859:   }

2861:   /* Destroy allocated memory */
2862:   if (sbuf_aa) {
2863:     PetscFree(sbuf_aa[0]);
2864:     PetscFree(sbuf_aa);
2865:   }
2866:   PetscFree5(*(PetscInt***)&irow,*(PetscInt***)&icol,nrow,ncol,issorted);

2868:   for (i=0; i<nrqs; ++i) {
2869:     PetscFree(rbuf4[i]);
2870:   }
2871:   PetscFree(rbuf4);

2873:   PetscFree4(row2proc,cmap,rmap,allcolumns);
2874:   return(0);
2875: }

2877: /*
2878:  Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B.
2879:  Embeddings are supposed to be injections, which implies that the range of rowemb is a subset
2880:  of [0,m), the range of dcolemb is a subset of [0,n), and the range of ocolemb is a subset of [0,N-n).
2881:  If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B.
2882:  After that B's columns are mapped into C's global column space, so that C is in the "disassembled"
2883:  state, and needs to be "assembled" later by compressing B's column space.

2885:  This function may be called in lieu of preallocation, so C should not be expected to be preallocated.
2886:  Following this call, C->A & C->B have been created, even if empty.
2887:  */
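/*
   A minimal usage sketch (the variable names Adiag and Boff are hypothetical, not part
   of PETSc): given SEQAIJ matrices Adiag and Boff holding the local diagonal and
   off-diagonal blocks, and injective index sets rowemb, dcolemb, ocolemb as described
   above, one might call

     MatSetSeqMats_MPIAIJ(C,rowemb,dcolemb,ocolemb,DIFFERENT_NONZERO_PATTERN,Adiag,Boff);
     MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
     MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);

   so that aij->B is compressed back into local form (see the note at the end of this
   function).
 */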
2888: PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C,IS rowemb,IS dcolemb,IS ocolemb,MatStructure pattern,Mat A,Mat B)
2889: {
2890:   /* If making this function public, change the error returned in this function away from _PLIB. */
2892:   Mat_MPIAIJ     *aij;
2893:   Mat_SeqAIJ     *Baij;
2894:   PetscBool      seqaij,Bdisassembled;
2895:   PetscInt       m,n,*nz,i,j,ngcol,col,rstart,rend,shift,count;
2896:   PetscScalar    v;
2897:   const PetscInt *rowindices,*colindices;

2900:   /* Check to make sure the component matrices (and embeddings) are compatible with C. */
2901:   if (A) {
2902:     PetscObjectBaseTypeCompare((PetscObject)A,MATSEQAIJ,&seqaij);
2903:     if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diagonal matrix is of wrong type");
2904:     if (rowemb) {
2905:       ISGetLocalSize(rowemb,&m);
2906:       if (m != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with diag matrix row size %D",m,A->rmap->n);
2907:     } else {
2908:       if (C->rmap->n != A->rmap->n) {
2909:         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is row-incompatible with the MPIAIJ matrix");
2910:       }
2911:     }
2912:     if (dcolemb) {
2913:       ISGetLocalSize(dcolemb,&n);
2914:       if (n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag col IS of size %D is incompatible with diag matrix col size %D",n,A->cmap->n);
2915:     } else {
2916:       if (C->cmap->n != A->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is col-incompatible with the MPIAIJ matrix");
2917:     }
2918:   }
2919:   if (B) {
2920:     PetscObjectBaseTypeCompare((PetscObject)B,MATSEQAIJ,&seqaij);
2921:     if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diagonal matrix is of wrong type");
2922:     if (rowemb) {
2923:       ISGetLocalSize(rowemb,&m);
2924:       if (m != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with off-diag matrix row size %D",m,B->rmap->n);
2925:     } else {
2926:       if (C->rmap->n != B->rmap->n) {
2927:         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is row-incompatible with the MPIAIJ matrix");
2928:       }
2929:     }
2930:     if (ocolemb) {
2931:       ISGetLocalSize(ocolemb,&n);
2932:       if (n != B->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag col IS of size %D is incompatible with off-diag matrix col size %D",n,B->cmap->n);
2933:     } else {
2934:       if (C->cmap->N - C->cmap->n != B->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is col-incompatible with the MPIAIJ matrix");
2935:     }
2936:   }

2938:   aij = (Mat_MPIAIJ*)C->data;
2939:   if (!aij->A) {
2940:     /* Mimic parts of MatMPIAIJSetPreallocation() */
2941:     MatCreate(PETSC_COMM_SELF,&aij->A);
2942:     MatSetSizes(aij->A,C->rmap->n,C->cmap->n,C->rmap->n,C->cmap->n);
2943:     MatSetBlockSizesFromMats(aij->A,C,C);
2944:     MatSetType(aij->A,MATSEQAIJ);
2945:     PetscLogObjectParent((PetscObject)C,(PetscObject)aij->A);
2946:   }
2947:   if (A) {
2948:     MatSetSeqMat_SeqAIJ(aij->A,rowemb,dcolemb,pattern,A);
2949:   } else {
2950:     MatSetUp(aij->A);
2951:   }
2952:   if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */
2953:     /*
2954:       If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and
2955:       need to "disassemble" B -- convert it to using C's global indices.
2956:       To insert the values we take the safer, albeit more expensive, route of MatSetValues().

2958:       If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate;
2959:       we MatZeroValues(B) first, so there may be a bunch of zeros that, perhaps, could be compacted out.

2961:       TODO: Put B's values into aij->B's aij structure in place using the embedding ISs?
2962:       At least avoid calling MatSetValues() and the implied searches?
2963:     */
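    /* Concretely, for DIFFERENT_NONZERO_PATTERN the code below destroys colmap, garray,
       lvec and Mvctx, destroys aij->B, and re-preallocates it from B's row lengths;
       for SUBSET_NONZERO_PATTERN it keeps aij->B's structure and only zeroes its entries
       before the values are inserted. */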

2965:     if (B && pattern == DIFFERENT_NONZERO_PATTERN) {
2966: #if defined(PETSC_USE_CTABLE)
2967:       PetscTableDestroy(&aij->colmap);
2968: #else
2969:       PetscFree(aij->colmap);
2970:       /* A bit of a HACK: ideally we should deal with the aij->B case all in one code block below. */
2971:       if (aij->B) {
2972:         PetscLogObjectMemory((PetscObject)C,-aij->B->cmap->n*sizeof(PetscInt));
2973:       }
2974: #endif
2975:       ngcol = 0;
2976:       if (aij->lvec) {
2977:         VecGetSize(aij->lvec,&ngcol);
2978:       }
2979:       if (aij->garray) {
2980:         PetscFree(aij->garray);
2981:         PetscLogObjectMemory((PetscObject)C,-ngcol*sizeof(PetscInt));
2982:       }
2983:       VecDestroy(&aij->lvec);
2984:       VecScatterDestroy(&aij->Mvctx);
2985:     }
2986:     if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) {
2987:       MatDestroy(&aij->B);
2988:     }
2989:     if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) {
2990:       MatZeroEntries(aij->B);
2991:     }
2992:   }
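  /* If the off-diagonal block is missing at this point (or was destroyed above), create it below
     with the full global column width; it is compressed to local form only when C is assembled,
     see the note at the end of this function. */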
2993:   Bdisassembled = PETSC_FALSE;
2994:   if (!aij->B) {
2995:     MatCreate(PETSC_COMM_SELF,&aij->B);
2996:     PetscLogObjectParent((PetscObject)C,(PetscObject)aij->B);
2997:     MatSetSizes(aij->B,C->rmap->n,C->cmap->N,C->rmap->n,C->cmap->N);
2998:     MatSetBlockSizesFromMats(aij->B,B,B);
2999:     MatSetType(aij->B,MATSEQAIJ);
3000:     Bdisassembled = PETSC_TRUE;
3001:   }
3002:   if (B) {
3003:     Baij = (Mat_SeqAIJ*)B->data;
3004:     if (pattern == DIFFERENT_NONZERO_PATTERN) {
3005:       PetscMalloc1(B->rmap->n,&nz);
3006:       for (i=0; i<B->rmap->n; i++) {
3007:         nz[i] = Baij->i[i+1] - Baij->i[i];
3008:       }
3009:       MatSeqAIJSetPreallocation(aij->B,0,nz);
3010:       PetscFree(nz);
3011:     }

3013:     PetscLayoutGetRange(C->rmap,&rstart,&rend);
3014:     shift = rend-rstart;
3015:     count = 0;
3016:     rowindices = NULL;
3017:     colindices = NULL;
3018:     if (rowemb) {
3019:       ISGetIndices(rowemb,&rowindices);
3020:     }
3021:     if (ocolemb) {
3022:       ISGetIndices(ocolemb,&colindices);
3023:     }
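    /* Copy B's entries into aij->B row by row, translating local row/column indices through the
       embedding ISs when present.  If aij->B was freshly created above with the full global column
       width (Bdisassembled), column indices at or beyond rstart are shifted past the local diagonal
       block so that they become global column indices: e.g., with rstart=10 and rend=20, a local
       off-diagonal column index of 12 maps to global column 22, while 5 stays at 5. */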
3024:     for (i=0; i<B->rmap->n; i++) {
3025:       PetscInt row;
3026:       row = i;
3027:       if (rowindices) row = rowindices[i];
3028:       for (j=Baij->i[i]; j<Baij->i[i+1]; j++) {
3029:         col  = Baij->j[count];
3030:         if (colindices) col = colindices[col];
3031:         if (Bdisassembled && col>=rstart) col += shift;
3032:         v    = Baij->a[count];
3033:         MatSetValues(aij->B,1,&row,1,&col,&v,INSERT_VALUES);
3034:         ++count;
3035:       }
3036:     }
3037:     /* No assembly for aij->B is necessary. */
3038:     /* FIXME: set aij->B's nonzerostate correctly. */
3039:   } else {
3040:     MatSetUp(aij->B);
3041:   }
3042:   C->preallocated  = PETSC_TRUE;
3043:   C->was_assembled = PETSC_FALSE;
3044:   C->assembled     = PETSC_FALSE;
3045:   /*
3046:      C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ().
3047:      Furthermore, its nonzerostate will need to be based on those of aij->A and aij->B.
3048:   */
3049:   return(0);
3050: }

3052: /*
3053:   B uses local column indices in the range [0, N-n); they must be interpreted using garray.
3054:  */
3055: PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C,Mat *A,Mat *B)
3056: {
3057:   Mat_MPIAIJ *aij = (Mat_MPIAIJ*)C->data;

3062:   /* FIXME: make sure C is assembled */
3063:   *A = aij->A;
3064:   *B = aij->B;
3065:   /* Note that we don't incref *A and *B, so be careful! */
3066:   return(0);
3067: }
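
/*
  A minimal usage sketch (illustration only, not part of the library): a caller of
  MatGetSeqMats_MPIAIJ() could translate B's local column indices into global column
  numbers via garray, assuming direct access to the Mat_MPIAIJ and Mat_SeqAIJ structures:

    Mat_MPIAIJ *aij  = (Mat_MPIAIJ*)C->data;
    Mat_SeqAIJ *Baij = (Mat_SeqAIJ*)aij->B->data;
    PetscInt   row,k,gcol;
    for (row=0; row<aij->B->rmap->n; row++) {
      for (k=Baij->i[row]; k<Baij->i[row+1]; k++) {
        gcol = aij->garray[Baij->j[k]];
      }
    }

  where gcol is the global column index of the nonzero stored at position k of local row row.
*/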

3069: /*
3070:   Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C.
3071:   NOT SCALABLE due to the use of ISGetNonlocalIS() (see below).
3072: */
3073: PetscErrorCode MatCreateSubMatricesMPI_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
3074:                                                PetscErrorCode(*getsubmats_seq)(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat**),
3075:                                                PetscErrorCode(*getlocalmats)(Mat,Mat*,Mat*),
3076:                                                PetscErrorCode(*setseqmat)(Mat,IS,IS,MatStructure,Mat),
3077:                                                PetscErrorCode(*setseqmats)(Mat,IS,IS,IS,MatStructure,Mat,Mat))
3078: {
3080:   PetscMPIInt    size,flag;
3081:   PetscInt       i,ii,cismax,ispar;
3082:   Mat            *A,*B;
3083:   IS             *isrow_p,*iscol_p,*cisrow,*ciscol,*ciscol_p;

3086:   if (!ismax) return(0);

3088:   for (i = 0, cismax = 0; i < ismax; ++i) {
3089:     MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm,&flag);
3090:     if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
3091:     MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);
3092:     if (size > 1) ++cismax;
3093:   }

3095:   /*
3096:      If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction.
3097:      ispar is the maximum of cismax over C's comm, so it is nonzero if and only if some rank holds a parallel IS.
3098:   */
3099:   MPIU_Allreduce(&cismax,&ispar,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));
3100:   if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */
3101:     (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);
3102:     return(0);
3103:   }

3105:   /* if (ispar) */
3106:   /*
3107:     Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only.
3108:     These are used to extract the off-diag portion of the resulting parallel matrix.
3109:     The row IS for the off-diag portion is the same as for the diag portion,
3110:     so we merely alias (without increfing) the row IS, while skipping those that are sequential.
3111:   */
3112:   PetscMalloc2(cismax,&cisrow,cismax,&ciscol);
3113:   PetscMalloc1(cismax,&ciscol_p);
3114:   for (i = 0, ii = 0; i < ismax; ++i) {
3115:     MPI_Comm_size(((PetscObject)isrow[i])->comm,&size);
3116:     if (size > 1) {
3117:       /*
3118:          TODO: This is the part that's ***NOT SCALABLE***.
3119:          To fix this we need to extract just the indices of C's nonzero columns
3120:          that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal
3121:          part of iscol[i] -- without actually computing ciscol[ii]. This also has
3122:          to be done without serializing on the IS list, so, most likely, it is best
3123:          done by rewriting MatCreateSubMatrices_MPIAIJ() directly.
3124:       */
3125:       ISGetNonlocalIS(iscol[i],&(ciscol[ii]));
3126:       /* Now we have to
3127:          (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices
3128:              were sorted on each rank, concatenated they might no longer be sorted;
3129:          (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the
3130:              indices in the nondecreasing order to the original index positions.
3131:          If ciscol[ii] is strictly increasing, the permutation IS is NULL.
3132:       */
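      /* Illustration of the description above (not normative): if ciscol[ii] has indices {7,2,5},
         the sorted IS is {2,5,7} and the permutation IS is {1,2,0}, i.e., entry k of the sorted IS
         came from position perm[k] of the original. */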
3133:       ISSortPermutation(ciscol[ii],PETSC_FALSE,ciscol_p+ii);
3134:       ISSort(ciscol[ii]);
3135:       ++ii;
3136:     }
3137:   }
3138:   PetscMalloc2(ismax,&isrow_p,ismax,&iscol_p);
3139:   for (i = 0, ii = 0; i < ismax; ++i) {
3140:     PetscInt       j,issize;
3141:     const PetscInt *indices;

3143:     /*
3144:        Permute the indices into a nondecreasing order. Reject row and col indices with duplicates.
3145:      */
3146:     ISSortPermutation(isrow[i],PETSC_FALSE,isrow_p+i);
3147:     ISSort(isrow[i]);
3148:     ISGetLocalSize(isrow[i],&issize);
3149:     ISGetIndices(isrow[i],&indices);
3150:     for (j = 1; j < issize; ++j) {
3151:       if (indices[j] == indices[j-1]) SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in row IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
3152:     }
3153:     ISRestoreIndices(isrow[i],&indices);
3154:     ISSortPermutation(iscol[i],PETSC_FALSE,iscol_p+i);
3155:     ISSort(iscol[i]);
3156:     ISGetLocalSize(iscol[i],&issize);
3157:     ISGetIndices(iscol[i],&indices);
3158:     for (j = 1; j < issize; ++j) {
3159:       if (indices[j-1] == indices[j]) SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in col IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
3160:     }
3161:     ISRestoreIndices(iscol[i],&indices);
3162:     MPI_Comm_size(((PetscObject)isrow[i])->comm,&size);
3163:     if (size > 1) {
3164:       cisrow[ii] = isrow[i];
3165:       ++ii;
3166:     }
3167:   }
3168:   /*
3169:     Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
3170:     array of sequential matrices underlying the resulting parallel matrices.
3171:     Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or
3172:     contain duplicates.

3174:     There are as many diag matrices as there are original index sets. There are only as many parallel
3175:     and off-diag matrices as there are parallel (comm size > 1) index sets.

3177:     ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq():
3178:     - If the array of MPI matrices already exists and is being reused, we need to allocate the array
3179:       and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq
3180:       will deposit the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them
3181:       with A[i] and B[ii] extracted from the corresponding MPI submat.
3182:     - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii]
3183:       will have a different order from what getsubmats_seq expects.  To handle this case -- indicated
3184:       by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii]
3185:       (or retrieve previously composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute
3186:       their values into A[i] and B[ii] sitting inside the corresponding submat.
3187:     - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding
3188:       A[i], B[ii], AA[i] or BB[ii] matrices.
3189:   */
3190:   /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */
3191:   if (scall == MAT_INITIAL_MATRIX) {
3192:     PetscMalloc1(ismax,submat);
3193:   }

3195:   /* Now obtain the sequential A and B submatrices separately. */
3196:   /* scall=MAT_REUSE_MATRIX is not handled yet, because getsubmats_seq() requires reuse of A and B */
3197:   (*getsubmats_seq)(C,ismax,isrow,iscol,MAT_INITIAL_MATRIX,&A);
3198:   (*getsubmats_seq)(C,cismax,cisrow,ciscol,MAT_INITIAL_MATRIX,&B);

3200:   /*
3201:     If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential
3202:     matrices A & B have been extracted directly into the parallel matrices containing them, or
3203:     simply into the sequential matrix identical with the corresponding A (if size == 1).
3204:     Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected
3205:     to have the same sparsity pattern.
3206:     Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap
3207:     must be constructed for C. This is done by setseqmat(s).
3208:   */
3209:   for (i = 0, ii = 0; i < ismax; ++i) {
3210:     /*
3211:        TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol?
3212:        That way we can avoid sorting and computing permutations when reusing.
3213:        To this end:
3214:         - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX
3215:         - if caching arrays to hold the ISs, make and compose a container for them so that it can
3216:           be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents).
3217:     */
3218:     MatStructure pattern = DIFFERENT_NONZERO_PATTERN;

3220:     MPI_Comm_size(((PetscObject)isrow[i])->comm,&size);
3221:     /* Construct submat[i] from the Seq pieces A (and B, if necessary). */
3222:     if (size > 1) {
3223:       if (scall == MAT_INITIAL_MATRIX) {
3224:         MatCreate(((PetscObject)isrow[i])->comm,(*submat)+i);
3225:         MatSetSizes((*submat)[i],A[i]->rmap->n,A[i]->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);
3226:         MatSetType((*submat)[i],MATMPIAIJ);
3227:         PetscLayoutSetUp((*submat)[i]->rmap);
3228:         PetscLayoutSetUp((*submat)[i]->cmap);
3229:       }
3230:       /*
3231:         For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix.
3232:       */
3233:       {
3234:         Mat AA = A[i],BB = B[ii];

3236:         if (AA || BB) {
3237:           setseqmats((*submat)[i],isrow_p[i],iscol_p[i],ciscol_p[ii],pattern,AA,BB);
3238:           MatAssemblyBegin((*submat)[i],MAT_FINAL_ASSEMBLY);
3239:           MatAssemblyEnd((*submat)[i],MAT_FINAL_ASSEMBLY);
3240:         }
3241:         MatDestroy(&AA);
3242:       }
3243:       ISDestroy(ciscol+ii);
3244:       ISDestroy(ciscol_p+ii);
3245:       ++ii;
3246:     } else { /* if (size == 1) */
3247:       if (scall == MAT_REUSE_MATRIX) {
3248:         MatDestroy(&(*submat)[i]);
3249:       }
3250:       if (isrow_p[i] || iscol_p[i]) {
3251:         MatDuplicate(A[i],MAT_DO_NOT_COPY_VALUES,(*submat)+i);
3252:         setseqmat((*submat)[i],isrow_p[i],iscol_p[i],pattern,A[i]);
3253:         /* Otherwise A is extracted straight into (*submat)[i]. */
3254:         /* TODO: Compose A[i] on (*submat)[i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */
3255:         MatDestroy(A+i);
3256:       } else (*submat)[i] = A[i];
3257:     }
3258:     ISDestroy(&isrow_p[i]);
3259:     ISDestroy(&iscol_p[i]);
3260:   }
3261:   PetscFree2(cisrow,ciscol);
3262:   PetscFree2(isrow_p,iscol_p);
3263:   PetscFree(ciscol_p);
3264:   PetscFree(A);
3265:   MatDestroySubMatrices(cismax,&B);
3266:   return(0);
3267: }

3269: PetscErrorCode MatCreateSubMatricesMPI_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
3270: {

3274:   MatCreateSubMatricesMPI_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatCreateSubMatrices_MPIAIJ,MatGetSeqMats_MPIAIJ,MatSetSeqMat_SeqAIJ,MatSetSeqMats_MPIAIJ);
3275:   return(0);
3276: }
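
/*
  A usage sketch (illustration only, not part of this file): an application normally reaches the
  routine above through the public MatCreateSubMatricesMPI() interface.  The index sets may live
  on subcommunicators of C's comm; here they are created on PETSC_COMM_WORLD for simplicity, and
  the variables nlocal and first are hypothetical placeholders.

    IS  isrow,iscol;
    Mat *submats;
    ISCreateStride(PETSC_COMM_WORLD,nlocal,first,1,&isrow);
    ISCreateStride(PETSC_COMM_WORLD,nlocal,first,1,&iscol);
    MatCreateSubMatricesMPI(C,1,&isrow,&iscol,MAT_INITIAL_MATRIX,&submats);
    MatDestroySubMatrices(1,&submats);
    ISDestroy(&isrow);
    ISDestroy(&iscol);
*/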