Actual source code: sbaijfact.c
2: #include <../src/mat/impls/baij/seq/baij.h>
3: #include <../src/mat/impls/sbaij/seq/sbaij.h>
4: #include <petsc/private/kernels/blockinvert.h>
5: #include <petscis.h>
7: PetscErrorCode MatGetInertia_SeqSBAIJ(Mat F,PetscInt *nneg,PetscInt *nzero,PetscInt *npos)
8: {
9: Mat_SeqSBAIJ *fact=(Mat_SeqSBAIJ*)F->data;
10: MatScalar *dd=fact->a;
11: PetscInt mbs=fact->mbs,bs=F->rmap->bs,i,nneg_tmp,npos_tmp,*fi=fact->diag;
14: if (bs != 1) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for bs: %D >1 yet",bs);
16: nneg_tmp = 0; npos_tmp = 0;
17: for (i=0; i<mbs; i++) {
18: if (PetscRealPart(dd[*fi]) > 0.0) npos_tmp++;
19: else if (PetscRealPart(dd[*fi]) < 0.0) nneg_tmp++;
20: fi++;
21: }
22: if (nneg) *nneg = nneg_tmp;
23: if (npos) *npos = npos_tmp;
24: if (nzero) *nzero = mbs - nneg_tmp - npos_tmp;
25: return(0);
26: }
28: /*
29: Symbolic U^T*D*U factorization for SBAIJ format. Modified from SSF of YSMP.
30: Use Modified Sparse Row (MSR) storage for u and ju. See page 85, "Iterative Methods ..." by Saad.
31: */
32: PetscErrorCode MatCholeskyFactorSymbolic_SeqSBAIJ_MSR(Mat F,Mat A,IS perm,const MatFactorInfo *info)
33: {
34: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b;
36: const PetscInt *rip,*ai,*aj;
37: PetscInt i,mbs = a->mbs,*jutmp,bs = A->rmap->bs,bs2=a->bs2;
38: PetscInt m,reallocs = 0,prow;
39: PetscInt *jl,*q,jmin,jmax,juidx,nzk,qm,*iu,*ju,k,j,vj,umax,maxadd;
40: PetscReal f = info->fill;
41: PetscBool perm_identity;
44: /* check whether perm is the identity mapping */
45: ISIdentity(perm,&perm_identity);
46: ISGetIndices(perm,&rip);
48: if (perm_identity) { /* without permutation */
49: a->permute = PETSC_FALSE;
51: ai = a->i; aj = a->j;
52: } else { /* non-trivial permutation */
53: a->permute = PETSC_TRUE;
55: MatReorderingSeqSBAIJ(A,perm);
57: ai = a->inew; aj = a->jnew;
58: }
60: /* initialization */
61: PetscMalloc1(mbs+1,&iu);
62: umax = (PetscInt)(f*ai[mbs] + 1); umax += mbs + 1;
63: PetscMalloc1(umax,&ju);
64: iu[0] = mbs+1;
65: juidx = mbs + 1; /* index for ju */
66: /* jl linked list for pivot row -- linked list for col index */
67: PetscMalloc2(mbs,&jl,mbs,&q);
68: for (i=0; i<mbs; i++) {
69: jl[i] = mbs;
70: q[i] = 0;
71: }
73: /* for each row k */
74: for (k=0; k<mbs; k++) {
75: for (i=0; i<mbs; i++) q[i] = 0; /* to be removed! */
76: nzk = 0; /* num. of nz blocks in k-th block row with diagonal block excluded */
77: q[k] = mbs;
78: /* initialize nonzero structure of k-th row to row rip[k] of A */
79: jmin = ai[rip[k]] +1; /* exclude diag[k] */
80: jmax = ai[rip[k]+1];
81: for (j=jmin; j<jmax; j++) {
82: vj = rip[aj[j]]; /* col. value */
83: if (vj > k) {
84: qm = k;
85: do {
86: m = qm; qm = q[m];
87: } while (qm < vj);
88: if (qm == vj) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Duplicate entry in A\n");
89: nzk++;
90: q[m] = vj;
91: q[vj] = qm;
92: } /* if (vj > k) */
93: } /* for (j=jmin; j<jmax; j++) */
95: /* modify nonzero structure of k-th row by computing fill-in
96: for each row i to be merged in */
97: prow = k;
98: prow = jl[prow]; /* next pivot row (== mbs for symbolic factorization) */
100: while (prow < k) {
101: /* merge row prow into k-th row */
102: jmin = iu[prow] + 1; jmax = iu[prow+1];
103: qm = k;
104: for (j=jmin; j<jmax; j++) {
105: vj = ju[j];
106: do {
107: m = qm; qm = q[m];
108: } while (qm < vj);
109: if (qm != vj) {
110: nzk++; q[m] = vj; q[vj] = qm; qm = vj;
111: }
112: }
113: prow = jl[prow]; /* next pivot row */
114: }
116: /* add k to row list for first nonzero element in k-th row */
117: if (nzk > 0) {
118: i = q[k]; /* col value of first nonzero element in U(k, k+1:mbs-1) */
119: jl[k] = jl[i]; jl[i] = k;
120: }
121: iu[k+1] = iu[k] + nzk;
123: /* allocate more space to ju if needed */
124: if (iu[k+1] > umax) {
125: /* estimate how much additional space we will need */
126: /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
127: /* just double the memory each time */
128: maxadd = umax;
129: if (maxadd < nzk) maxadd = (mbs-k)*(nzk+1)/2;
130: umax += maxadd;
132: /* allocate a longer ju */
133: PetscMalloc1(umax,&jutmp);
134: PetscArraycpy(jutmp,ju,iu[k]);
135: PetscFree(ju);
136: ju = jutmp;
137: reallocs++; /* count how many times we realloc */
138: }
140: /* save nonzero structure of k-th row in ju */
141: i=k;
142: while (nzk--) {
143: i = q[i];
144: ju[juidx++] = i;
145: }
146: }
148: #if defined(PETSC_USE_INFO)
149: if (ai[mbs] != 0) {
150: PetscReal af = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
151: PetscInfo3(A,"Reallocs %D Fill ratio:given %g needed %g\n",reallocs,(double)f,(double)af);
152: PetscInfo1(A,"Run with -pc_factor_fill %g or use \n",(double)af);
153: PetscInfo1(A,"PCFactorSetFill(pc,%g);\n",(double)af);
154: PetscInfo(A,"for best performance.\n");
155: } else {
156: PetscInfo(A,"Empty matrix\n");
157: }
158: #endif
160: ISRestoreIndices(perm,&rip);
161: PetscFree2(jl,q);
163: /* put together the new matrix */
164: MatSeqSBAIJSetPreallocation(F,bs,MAT_SKIP_ALLOCATION,NULL);
166: /* PetscLogObjectParent((PetscObject)B,(PetscObject)iperm); */
167: b = (Mat_SeqSBAIJ*)(F)->data;
168: b->singlemalloc = PETSC_FALSE;
169: b->free_a = PETSC_TRUE;
170: b->free_ij = PETSC_TRUE;
172: PetscMalloc1((iu[mbs]+1)*bs2,&b->a);
173: b->j = ju;
174: b->i = iu;
175: b->diag = NULL;
176: b->ilen = NULL;
177: b->imax = NULL;
178: b->row = perm;
180: b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
182: PetscObjectReference((PetscObject)perm);
184: b->icol = perm;
185: PetscObjectReference((PetscObject)perm);
186: PetscMalloc1(bs*mbs+bs,&b->solve_work);
187: /* In b structure: Free imax, ilen, old a, old j.
188: Allocate idnew, solve_work, new a, new j */
189: PetscLogObjectMemory((PetscObject)F,(iu[mbs]-mbs)*(sizeof(PetscInt)+sizeof(MatScalar)));
190: b->maxnz = b->nz = iu[mbs];
192: (F)->info.factor_mallocs = reallocs;
193: (F)->info.fill_ratio_given = f;
194: if (ai[mbs] != 0) {
195: (F)->info.fill_ratio_needed = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
196: } else {
197: (F)->info.fill_ratio_needed = 0.0;
198: }
199: MatSeqSBAIJSetNumericFactorization_inplace(F,perm_identity);
200: return(0);
201: }
202: /*
203: Symbolic U^T*D*U factorization for SBAIJ format.
204: See MatICCFactorSymbolic_SeqAIJ() for description of its data structure.
205: */
206: #include <petscbt.h>
207: #include <../src/mat/utils/freespace.h>
208: PetscErrorCode MatCholeskyFactorSymbolic_SeqSBAIJ(Mat fact,Mat A,IS perm,const MatFactorInfo *info)
209: {
210: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
211: Mat_SeqSBAIJ *b;
212: PetscErrorCode ierr;
213: PetscBool perm_identity,missing;
214: PetscReal fill = info->fill;
215: const PetscInt *rip,*ai=a->i,*aj=a->j;
216: PetscInt i,mbs=a->mbs,bs=A->rmap->bs,reallocs=0,prow;
217: PetscInt *jl,jmin,jmax,nzk,*ui,k,j,*il,nextprow;
218: PetscInt nlnk,*lnk,ncols,*cols,*uj,**ui_ptr,*uj_ptr,*udiag;
219: PetscFreeSpaceList free_space=NULL,current_space=NULL;
220: PetscBT lnkbt;
223: if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n);
224: MatMissingDiagonal(A,&missing,&i);
225: if (missing) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",i);
226: if (bs > 1) {
227: MatCholeskyFactorSymbolic_SeqSBAIJ_inplace(fact,A,perm,info);
228: return(0);
229: }
231: /* check whether perm is the identity mapping */
232: ISIdentity(perm,&perm_identity);
233: if (!perm_identity) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format");
234: a->permute = PETSC_FALSE;
235: ISGetIndices(perm,&rip);
237: /* initialization */
238: PetscMalloc1(mbs+1,&ui);
239: PetscMalloc1(mbs+1,&udiag);
240: ui[0] = 0;
242: /* jl: linked list for storing indices of the pivot rows
243: il: il[i] points to the 1st nonzero entry of U(i,k:mbs-1) */
244: PetscMalloc4(mbs,&ui_ptr,mbs,&il,mbs,&jl,mbs,&cols);
245: for (i=0; i<mbs; i++) {
246: jl[i] = mbs; il[i] = 0;
247: }
249: /* create and initialize a linked list for storing column indices of the active row k */
250: nlnk = mbs + 1;
251: PetscLLCreate(mbs,mbs,nlnk,lnk,lnkbt);
253: /* initial FreeSpace size is fill*(ai[mbs]+1) */
254: PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,ai[mbs]+1),&free_space);
255: current_space = free_space;
257: for (k=0; k<mbs; k++) { /* for each active row k */
258: /* initialize lnk by the column indices of row rip[k] of A */
259: nzk = 0;
260: ncols = ai[k+1] - ai[k];
261: if (!ncols) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_CH_ZRPVT,"Empty row %D in matrix ",k);
262: for (j=0; j<ncols; j++) {
263: i = *(aj + ai[k] + j);
264: cols[j] = i;
265: }
266: PetscLLAdd(ncols,cols,mbs,nlnk,lnk,lnkbt);
267: nzk += nlnk;
269: /* update lnk by computing fill-in for each pivot row to be merged in */
270: prow = jl[k]; /* 1st pivot row */
272: while (prow < k) {
273: nextprow = jl[prow];
274: /* merge prow into k-th row */
275: jmin = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:mbs-1) */
276: jmax = ui[prow+1];
277: ncols = jmax-jmin;
278: uj_ptr = ui_ptr[prow] + jmin - ui[prow]; /* points to the 2nd nzero entry in U(prow,k:mbs-1) */
279: PetscLLAddSorted(ncols,uj_ptr,mbs,nlnk,lnk,lnkbt);
280: nzk += nlnk;
282: /* update il and jl for prow */
283: if (jmin < jmax) {
284: il[prow] = jmin;
285: j = *uj_ptr; jl[prow] = jl[j]; jl[j] = prow;
286: }
287: prow = nextprow;
288: }
290: /* if free space is not available, make more free space */
291: if (current_space->local_remaining<nzk) {
292: i = mbs - k + 1; /* num of unfactored rows */
293: i = PetscIntMultTruncate(i,PetscMin(nzk, i-1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
294: PetscFreeSpaceGet(i,¤t_space);
295: reallocs++;
296: }
298: /* copy data into free space, then initialize lnk */
299: PetscLLClean(mbs,mbs,nzk,lnk,current_space->array,lnkbt);
301: /* add the k-th row into il and jl */
302: if (nzk > 1) {
303: i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:mbs-1) */
304: jl[k] = jl[i]; jl[i] = k;
305: il[k] = ui[k] + 1;
306: }
307: ui_ptr[k] = current_space->array;
309: current_space->array += nzk;
310: current_space->local_used += nzk;
311: current_space->local_remaining -= nzk;
313: ui[k+1] = ui[k] + nzk;
314: }
316: ISRestoreIndices(perm,&rip);
317: PetscFree4(ui_ptr,il,jl,cols);
319: /* destroy list of free space and other temporary array(s) */
320: PetscMalloc1(ui[mbs]+1,&uj);
321: PetscFreeSpaceContiguous_Cholesky(&free_space,uj,mbs,ui,udiag); /* store matrix factor */
322: PetscLLDestroy(lnk,lnkbt);
324: /* put together the new matrix in MATSEQSBAIJ format */
325: MatSeqSBAIJSetPreallocation(fact,bs,MAT_SKIP_ALLOCATION,NULL);
327: b = (Mat_SeqSBAIJ*)fact->data;
328: b->singlemalloc = PETSC_FALSE;
329: b->free_a = PETSC_TRUE;
330: b->free_ij = PETSC_TRUE;
332: PetscMalloc1(ui[mbs]+1,&b->a);
334: b->j = uj;
335: b->i = ui;
336: b->diag = udiag;
337: b->free_diag = PETSC_TRUE;
338: b->ilen = NULL;
339: b->imax = NULL;
340: b->row = perm;
341: b->icol = perm;
343: PetscObjectReference((PetscObject)perm);
344: PetscObjectReference((PetscObject)perm);
346: b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
348: PetscMalloc1(mbs+1,&b->solve_work);
349: PetscLogObjectMemory((PetscObject)fact,ui[mbs]*(sizeof(PetscInt)+sizeof(MatScalar)));
351: b->maxnz = b->nz = ui[mbs];
353: fact->info.factor_mallocs = reallocs;
354: fact->info.fill_ratio_given = fill;
355: if (ai[mbs] != 0) {
356: fact->info.fill_ratio_needed = ((PetscReal)ui[mbs])/ai[mbs];
357: } else {
358: fact->info.fill_ratio_needed = 0.0;
359: }
360: #if defined(PETSC_USE_INFO)
361: if (ai[mbs] != 0) {
362: PetscReal af = fact->info.fill_ratio_needed;
363: PetscInfo3(A,"Reallocs %D Fill ratio:given %g needed %g\n",reallocs,(double)fill,(double)af);
364: PetscInfo1(A,"Run with -pc_factor_fill %g or use \n",(double)af);
365: PetscInfo1(A,"PCFactorSetFill(pc,%g) for best performance.\n",(double)af);
366: } else {
367: PetscInfo(A,"Empty matrix\n");
368: }
369: #endif
370: fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
371: return(0);
372: }
374: PetscErrorCode MatCholeskyFactorSymbolic_SeqSBAIJ_inplace(Mat fact,Mat A,IS perm,const MatFactorInfo *info)
375: {
376: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
377: Mat_SeqSBAIJ *b;
378: PetscErrorCode ierr;
379: PetscBool perm_identity,missing;
380: PetscReal fill = info->fill;
381: const PetscInt *rip,*ai,*aj;
382: PetscInt i,mbs=a->mbs,bs=A->rmap->bs,reallocs=0,prow,d;
383: PetscInt *jl,jmin,jmax,nzk,*ui,k,j,*il,nextprow;
384: PetscInt nlnk,*lnk,ncols,*cols,*uj,**ui_ptr,*uj_ptr;
385: PetscFreeSpaceList free_space=NULL,current_space=NULL;
386: PetscBT lnkbt;
389: MatMissingDiagonal(A,&missing,&d);
390: if (missing) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d);
392: /*
393: This code originally uses Modified Sparse Row (MSR) storage
394: (see page 85, "Iterative Methods ..." by Saad) for the output matrix B - bad choise!
395: Then it is rewritten so the factor B takes seqsbaij format. However the associated
396: MatCholeskyFactorNumeric_() have not been modified for the cases of bs>1 or !perm_identity,
397: thus the original code in MSR format is still used for these cases.
398: The code below should replace MatCholeskyFactorSymbolic_SeqSBAIJ_MSR() whenever
399: MatCholeskyFactorNumeric_() is modified for using sbaij symbolic factor.
400: */
401: if (bs > 1) {
402: MatCholeskyFactorSymbolic_SeqSBAIJ_MSR(fact,A,perm,info);
403: return(0);
404: }
406: /* check whether perm is the identity mapping */
407: ISIdentity(perm,&perm_identity);
409: if (perm_identity) {
410: a->permute = PETSC_FALSE;
412: ai = a->i; aj = a->j;
413: } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format");
414: ISGetIndices(perm,&rip);
416: /* initialization */
417: PetscMalloc1(mbs+1,&ui);
418: ui[0] = 0;
420: /* jl: linked list for storing indices of the pivot rows
421: il: il[i] points to the 1st nonzero entry of U(i,k:mbs-1) */
422: PetscMalloc4(mbs,&ui_ptr,mbs,&il,mbs,&jl,mbs,&cols);
423: for (i=0; i<mbs; i++) {
424: jl[i] = mbs; il[i] = 0;
425: }
427: /* create and initialize a linked list for storing column indices of the active row k */
428: nlnk = mbs + 1;
429: PetscLLCreate(mbs,mbs,nlnk,lnk,lnkbt);
431: /* initial FreeSpace size is fill*(ai[mbs]+1) */
432: PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,ai[mbs]+1),&free_space);
433: current_space = free_space;
435: for (k=0; k<mbs; k++) { /* for each active row k */
436: /* initialize lnk by the column indices of row rip[k] of A */
437: nzk = 0;
438: ncols = ai[rip[k]+1] - ai[rip[k]];
439: for (j=0; j<ncols; j++) {
440: i = *(aj + ai[rip[k]] + j);
441: cols[j] = rip[i];
442: }
443: PetscLLAdd(ncols,cols,mbs,nlnk,lnk,lnkbt);
444: nzk += nlnk;
446: /* update lnk by computing fill-in for each pivot row to be merged in */
447: prow = jl[k]; /* 1st pivot row */
449: while (prow < k) {
450: nextprow = jl[prow];
451: /* merge prow into k-th row */
452: jmin = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:mbs-1) */
453: jmax = ui[prow+1];
454: ncols = jmax-jmin;
455: uj_ptr = ui_ptr[prow] + jmin - ui[prow]; /* points to the 2nd nzero entry in U(prow,k:mbs-1) */
456: PetscLLAddSorted(ncols,uj_ptr,mbs,nlnk,lnk,lnkbt);
457: nzk += nlnk;
459: /* update il and jl for prow */
460: if (jmin < jmax) {
461: il[prow] = jmin;
463: j = *uj_ptr; jl[prow] = jl[j]; jl[j] = prow;
464: }
465: prow = nextprow;
466: }
468: /* if free space is not available, make more free space */
469: if (current_space->local_remaining<nzk) {
470: i = mbs - k + 1; /* num of unfactored rows */
471: i = PetscMin(PetscIntMultTruncate(i,nzk), PetscIntMultTruncate(i,i-1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
472: PetscFreeSpaceGet(i,¤t_space);
473: reallocs++;
474: }
476: /* copy data into free space, then initialize lnk */
477: PetscLLClean(mbs,mbs,nzk,lnk,current_space->array,lnkbt);
479: /* add the k-th row into il and jl */
480: if (nzk-1 > 0) {
481: i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:mbs-1) */
482: jl[k] = jl[i]; jl[i] = k;
483: il[k] = ui[k] + 1;
484: }
485: ui_ptr[k] = current_space->array;
487: current_space->array += nzk;
488: current_space->local_used += nzk;
489: current_space->local_remaining -= nzk;
491: ui[k+1] = ui[k] + nzk;
492: }
494: ISRestoreIndices(perm,&rip);
495: PetscFree4(ui_ptr,il,jl,cols);
497: /* destroy list of free space and other temporary array(s) */
498: PetscMalloc1(ui[mbs]+1,&uj);
499: PetscFreeSpaceContiguous(&free_space,uj);
500: PetscLLDestroy(lnk,lnkbt);
502: /* put together the new matrix in MATSEQSBAIJ format */
503: MatSeqSBAIJSetPreallocation(fact,bs,MAT_SKIP_ALLOCATION,NULL);
505: b = (Mat_SeqSBAIJ*)fact->data;
506: b->singlemalloc = PETSC_FALSE;
507: b->free_a = PETSC_TRUE;
508: b->free_ij = PETSC_TRUE;
510: PetscMalloc1(ui[mbs]+1,&b->a);
512: b->j = uj;
513: b->i = ui;
514: b->diag = NULL;
515: b->ilen = NULL;
516: b->imax = NULL;
517: b->row = perm;
519: b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
521: PetscObjectReference((PetscObject)perm);
522: b->icol = perm;
523: PetscObjectReference((PetscObject)perm);
524: PetscMalloc1(mbs+1,&b->solve_work);
525: PetscLogObjectMemory((PetscObject)fact,(ui[mbs]-mbs)*(sizeof(PetscInt)+sizeof(MatScalar)));
526: b->maxnz = b->nz = ui[mbs];
528: fact->info.factor_mallocs = reallocs;
529: fact->info.fill_ratio_given = fill;
530: if (ai[mbs] != 0) {
531: fact->info.fill_ratio_needed = ((PetscReal)ui[mbs])/ai[mbs];
532: } else {
533: fact->info.fill_ratio_needed = 0.0;
534: }
535: #if defined(PETSC_USE_INFO)
536: if (ai[mbs] != 0) {
537: PetscReal af = fact->info.fill_ratio_needed;
538: PetscInfo3(A,"Reallocs %D Fill ratio:given %g needed %g\n",reallocs,(double)fill,(double)af);
539: PetscInfo1(A,"Run with -pc_factor_fill %g or use \n",(double)af);
540: PetscInfo1(A,"PCFactorSetFill(pc,%g) for best performance.\n",(double)af);
541: } else {
542: PetscInfo(A,"Empty matrix\n");
543: }
544: #endif
545: MatSeqSBAIJSetNumericFactorization_inplace(fact,perm_identity);
546: return(0);
547: }
549: PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_N(Mat C,Mat A,const MatFactorInfo *info)
550: {
551: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ*)C->data;
552: IS perm = b->row;
554: const PetscInt *ai,*aj,*perm_ptr,mbs=a->mbs,*bi=b->i,*bj=b->j;
555: PetscInt i,j;
556: PetscInt *a2anew,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
557: PetscInt bs =A->rmap->bs,bs2 = a->bs2;
558: MatScalar *ba = b->a,*aa,*ap,*dk,*uik;
559: MatScalar *u,*diag,*rtmp,*rtmp_ptr;
560: MatScalar *work;
561: PetscInt *pivots;
562: PetscBool allowzeropivot,zeropivotdetected;
565: /* initialization */
566: PetscCalloc1(bs2*mbs,&rtmp);
567: PetscMalloc2(mbs,&il,mbs,&jl);
568: allowzeropivot = PetscNot(A->erroriffailure);
570: il[0] = 0;
571: for (i=0; i<mbs; i++) jl[i] = mbs;
573: PetscMalloc3(bs2,&dk,bs2,&uik,bs,&work);
574: PetscMalloc1(bs,&pivots);
576: ISGetIndices(perm,&perm_ptr);
578: /* check permutation */
579: if (!a->permute) {
580: ai = a->i; aj = a->j; aa = a->a;
581: } else {
582: ai = a->inew; aj = a->jnew;
583: PetscMalloc1(bs2*ai[mbs],&aa);
584: PetscArraycpy(aa,a->a,bs2*ai[mbs]);
585: PetscMalloc1(ai[mbs],&a2anew);
586: PetscArraycpy(a2anew,a->a2anew,ai[mbs]);
588: for (i=0; i<mbs; i++) {
589: jmin = ai[i]; jmax = ai[i+1];
590: for (j=jmin; j<jmax; j++) {
591: while (a2anew[j] != j) {
592: k = a2anew[j]; a2anew[j] = a2anew[k]; a2anew[k] = k;
593: for (k1=0; k1<bs2; k1++) {
594: dk[k1] = aa[k*bs2+k1];
595: aa[k*bs2+k1] = aa[j*bs2+k1];
596: aa[j*bs2+k1] = dk[k1];
597: }
598: }
599: /* transform columnoriented blocks that lie in the lower triangle to roworiented blocks */
600: if (i > aj[j]) {
601: ap = aa + j*bs2; /* ptr to the beginning of j-th block of aa */
602: for (k=0; k<bs2; k++) dk[k] = ap[k]; /* dk <- j-th block of aa */
603: for (k=0; k<bs; k++) { /* j-th block of aa <- dk^T */
604: for (k1=0; k1<bs; k1++) *ap++ = dk[k + bs*k1];
605: }
606: }
607: }
608: }
609: PetscFree(a2anew);
610: }
612: /* for each row k */
613: for (k = 0; k<mbs; k++) {
615: /*initialize k-th row with elements nonzero in row perm(k) of A */
616: jmin = ai[perm_ptr[k]]; jmax = ai[perm_ptr[k]+1];
618: ap = aa + jmin*bs2;
619: for (j = jmin; j < jmax; j++) {
620: vj = perm_ptr[aj[j]]; /* block col. index */
621: rtmp_ptr = rtmp + vj*bs2;
622: for (i=0; i<bs2; i++) *rtmp_ptr++ = *ap++;
623: }
625: /* modify k-th row by adding in those rows i with U(i,k) != 0 */
626: PetscArraycpy(dk,rtmp+k*bs2,bs2);
627: i = jl[k]; /* first row to be added to k_th row */
629: while (i < k) {
630: nexti = jl[i]; /* next row to be added to k_th row */
632: /* compute multiplier */
633: ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */
635: /* uik = -inv(Di)*U_bar(i,k) */
636: diag = ba + i*bs2;
637: u = ba + ili*bs2;
638: PetscArrayzero(uik,bs2);
639: PetscKernel_A_gets_A_minus_B_times_C(bs,uik,diag,u);
641: /* update D(k) += -U(i,k)^T * U_bar(i,k) */
642: PetscKernel_A_gets_A_plus_Btranspose_times_C(bs,dk,uik,u);
643: PetscLogFlops(4.0*bs*bs2);
645: /* update -U(i,k) */
646: PetscArraycpy(ba+ili*bs2,uik,bs2);
648: /* add multiple of row i to k-th row ... */
649: jmin = ili + 1; jmax = bi[i+1];
650: if (jmin < jmax) {
651: for (j=jmin; j<jmax; j++) {
652: /* rtmp += -U(i,k)^T * U_bar(i,j) */
653: rtmp_ptr = rtmp + bj[j]*bs2;
654: u = ba + j*bs2;
655: PetscKernel_A_gets_A_plus_Btranspose_times_C(bs,rtmp_ptr,uik,u);
656: }
657: PetscLogFlops(2.0*bs*bs2*(jmax-jmin));
659: /* ... add i to row list for next nonzero entry */
660: il[i] = jmin; /* update il(i) in column k+1, ... mbs-1 */
661: j = bj[jmin];
662: jl[i] = jl[j]; jl[j] = i; /* update jl */
663: }
664: i = nexti;
665: }
667: /* save nonzero entries in k-th row of U ... */
669: /* invert diagonal block */
670: diag = ba+k*bs2;
671: PetscArraycpy(diag,dk,bs2);
673: PetscKernel_A_gets_inverse_A(bs,diag,pivots,work,allowzeropivot,&zeropivotdetected);
674: if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
676: jmin = bi[k]; jmax = bi[k+1];
677: if (jmin < jmax) {
678: for (j=jmin; j<jmax; j++) {
679: vj = bj[j]; /* block col. index of U */
680: u = ba + j*bs2;
681: rtmp_ptr = rtmp + vj*bs2;
682: for (k1=0; k1<bs2; k1++) {
683: *u++ = *rtmp_ptr;
684: *rtmp_ptr++ = 0.0;
685: }
686: }
688: /* ... add k to row list for first nonzero entry in k-th row */
689: il[k] = jmin;
690: i = bj[jmin];
691: jl[k] = jl[i]; jl[i] = k;
692: }
693: }
695: PetscFree(rtmp);
696: PetscFree2(il,jl);
697: PetscFree3(dk,uik,work);
698: PetscFree(pivots);
699: if (a->permute) {
700: PetscFree(aa);
701: }
703: ISRestoreIndices(perm,&perm_ptr);
705: C->ops->solve = MatSolve_SeqSBAIJ_N_inplace;
706: C->ops->solvetranspose = MatSolve_SeqSBAIJ_N_inplace;
707: C->ops->forwardsolve = MatForwardSolve_SeqSBAIJ_N_inplace;
708: C->ops->backwardsolve = MatBackwardSolve_SeqSBAIJ_N_inplace;
710: C->assembled = PETSC_TRUE;
711: C->preallocated = PETSC_TRUE;
713: PetscLogFlops(1.3333*bs*bs2*b->mbs); /* from inverting diagonal blocks */
714: return(0);
715: }
717: PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_N_NaturalOrdering(Mat C,Mat A,const MatFactorInfo *info)
718: {
719: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ*)C->data;
721: PetscInt i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
722: PetscInt *ai,*aj,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
723: PetscInt bs =A->rmap->bs,bs2 = a->bs2;
724: MatScalar *ba = b->a,*aa,*ap,*dk,*uik;
725: MatScalar *u,*diag,*rtmp,*rtmp_ptr;
726: MatScalar *work;
727: PetscInt *pivots;
728: PetscBool allowzeropivot,zeropivotdetected;
731: PetscCalloc1(bs2*mbs,&rtmp);
732: PetscMalloc2(mbs,&il,mbs,&jl);
733: il[0] = 0;
734: for (i=0; i<mbs; i++) jl[i] = mbs;
736: PetscMalloc3(bs2,&dk,bs2,&uik,bs,&work);
737: PetscMalloc1(bs,&pivots);
738: allowzeropivot = PetscNot(A->erroriffailure);
740: ai = a->i; aj = a->j; aa = a->a;
742: /* for each row k */
743: for (k = 0; k<mbs; k++) {
745: /*initialize k-th row with elements nonzero in row k of A */
746: jmin = ai[k]; jmax = ai[k+1];
747: ap = aa + jmin*bs2;
748: for (j = jmin; j < jmax; j++) {
749: vj = aj[j]; /* block col. index */
750: rtmp_ptr = rtmp + vj*bs2;
751: for (i=0; i<bs2; i++) *rtmp_ptr++ = *ap++;
752: }
754: /* modify k-th row by adding in those rows i with U(i,k) != 0 */
755: PetscArraycpy(dk,rtmp+k*bs2,bs2);
756: i = jl[k]; /* first row to be added to k_th row */
758: while (i < k) {
759: nexti = jl[i]; /* next row to be added to k_th row */
761: /* compute multiplier */
762: ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */
764: /* uik = -inv(Di)*U_bar(i,k) */
765: diag = ba + i*bs2;
766: u = ba + ili*bs2;
767: PetscArrayzero(uik,bs2);
768: PetscKernel_A_gets_A_minus_B_times_C(bs,uik,diag,u);
770: /* update D(k) += -U(i,k)^T * U_bar(i,k) */
771: PetscKernel_A_gets_A_plus_Btranspose_times_C(bs,dk,uik,u);
772: PetscLogFlops(2.0*bs*bs2);
774: /* update -U(i,k) */
775: PetscArraycpy(ba+ili*bs2,uik,bs2);
777: /* add multiple of row i to k-th row ... */
778: jmin = ili + 1; jmax = bi[i+1];
779: if (jmin < jmax) {
780: for (j=jmin; j<jmax; j++) {
781: /* rtmp += -U(i,k)^T * U_bar(i,j) */
782: rtmp_ptr = rtmp + bj[j]*bs2;
783: u = ba + j*bs2;
784: PetscKernel_A_gets_A_plus_Btranspose_times_C(bs,rtmp_ptr,uik,u);
785: }
786: PetscLogFlops(2.0*bs*bs2*(jmax-jmin));
788: /* ... add i to row list for next nonzero entry */
789: il[i] = jmin; /* update il(i) in column k+1, ... mbs-1 */
790: j = bj[jmin];
791: jl[i] = jl[j]; jl[j] = i; /* update jl */
792: }
793: i = nexti;
794: }
796: /* save nonzero entries in k-th row of U ... */
798: /* invert diagonal block */
799: diag = ba+k*bs2;
800: PetscArraycpy(diag,dk,bs2);
802: PetscKernel_A_gets_inverse_A(bs,diag,pivots,work,allowzeropivot,&zeropivotdetected);
803: if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
805: jmin = bi[k]; jmax = bi[k+1];
806: if (jmin < jmax) {
807: for (j=jmin; j<jmax; j++) {
808: vj = bj[j]; /* block col. index of U */
809: u = ba + j*bs2;
810: rtmp_ptr = rtmp + vj*bs2;
811: for (k1=0; k1<bs2; k1++) {
812: *u++ = *rtmp_ptr;
813: *rtmp_ptr++ = 0.0;
814: }
815: }
817: /* ... add k to row list for first nonzero entry in k-th row */
818: il[k] = jmin;
819: i = bj[jmin];
820: jl[k] = jl[i]; jl[i] = k;
821: }
822: }
824: PetscFree(rtmp);
825: PetscFree2(il,jl);
826: PetscFree3(dk,uik,work);
827: PetscFree(pivots);
829: C->ops->solve = MatSolve_SeqSBAIJ_N_NaturalOrdering_inplace;
830: C->ops->solvetranspose = MatSolve_SeqSBAIJ_N_NaturalOrdering_inplace;
831: C->ops->forwardsolve = MatForwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace;
832: C->ops->backwardsolve = MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace;
833: C->assembled = PETSC_TRUE;
834: C->preallocated = PETSC_TRUE;
836: PetscLogFlops(1.3333*bs*bs2*b->mbs); /* from inverting diagonal blocks */
837: return(0);
838: }
840: /*
841: Numeric U^T*D*U factorization for SBAIJ format. Modified from SNF of YSMP.
842: Version for blocks 2 by 2.
843: */
844: PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_2(Mat C,Mat A,const MatFactorInfo *info)
845: {
846: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ*)C->data;
847: IS perm = b->row;
849: const PetscInt *ai,*aj,*perm_ptr;
850: PetscInt i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
851: PetscInt *a2anew,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
852: MatScalar *ba = b->a,*aa,*ap;
853: MatScalar *u,*diag,*rtmp,*rtmp_ptr,dk[4],uik[4];
854: PetscReal shift = info->shiftamount;
855: PetscBool allowzeropivot,zeropivotdetected;
858: allowzeropivot = PetscNot(A->erroriffailure);
860: /* initialization */
861: /* il and jl record the first nonzero element in each row of the accessing
862: window U(0:k, k:mbs-1).
863: jl: list of rows to be added to uneliminated rows
864: i>= k: jl(i) is the first row to be added to row i
865: i< k: jl(i) is the row following row i in some list of rows
866: jl(i) = mbs indicates the end of a list
867: il(i): points to the first nonzero element in columns k,...,mbs-1 of
868: row i of U */
869: PetscCalloc1(4*mbs,&rtmp);
870: PetscMalloc2(mbs,&il,mbs,&jl);
871: il[0] = 0;
872: for (i=0; i<mbs; i++) jl[i] = mbs;
874: ISGetIndices(perm,&perm_ptr);
876: /* check permutation */
877: if (!a->permute) {
878: ai = a->i; aj = a->j; aa = a->a;
879: } else {
880: ai = a->inew; aj = a->jnew;
881: PetscMalloc1(4*ai[mbs],&aa);
882: PetscArraycpy(aa,a->a,4*ai[mbs]);
883: PetscMalloc1(ai[mbs],&a2anew);
884: PetscArraycpy(a2anew,a->a2anew,ai[mbs]);
886: for (i=0; i<mbs; i++) {
887: jmin = ai[i]; jmax = ai[i+1];
888: for (j=jmin; j<jmax; j++) {
889: while (a2anew[j] != j) {
890: k = a2anew[j]; a2anew[j] = a2anew[k]; a2anew[k] = k;
891: for (k1=0; k1<4; k1++) {
892: dk[k1] = aa[k*4+k1];
893: aa[k*4+k1] = aa[j*4+k1];
894: aa[j*4+k1] = dk[k1];
895: }
896: }
897: /* transform columnoriented blocks that lie in the lower triangle to roworiented blocks */
898: if (i > aj[j]) {
899: ap = aa + j*4; /* ptr to the beginning of the block */
900: dk[1] = ap[1]; /* swap ap[1] and ap[2] */
901: ap[1] = ap[2];
902: ap[2] = dk[1];
903: }
904: }
905: }
906: PetscFree(a2anew);
907: }
909: /* for each row k */
910: for (k = 0; k<mbs; k++) {
912: /*initialize k-th row with elements nonzero in row perm(k) of A */
913: jmin = ai[perm_ptr[k]]; jmax = ai[perm_ptr[k]+1];
914: ap = aa + jmin*4;
915: for (j = jmin; j < jmax; j++) {
916: vj = perm_ptr[aj[j]]; /* block col. index */
917: rtmp_ptr = rtmp + vj*4;
918: for (i=0; i<4; i++) *rtmp_ptr++ = *ap++;
919: }
921: /* modify k-th row by adding in those rows i with U(i,k) != 0 */
922: PetscArraycpy(dk,rtmp+k*4,4);
923: i = jl[k]; /* first row to be added to k_th row */
925: while (i < k) {
926: nexti = jl[i]; /* next row to be added to k_th row */
928: /* compute multiplier */
929: ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */
931: /* uik = -inv(Di)*U_bar(i,k): - ba[ili]*ba[i] */
932: diag = ba + i*4;
933: u = ba + ili*4;
934: uik[0] = -(diag[0]*u[0] + diag[2]*u[1]);
935: uik[1] = -(diag[1]*u[0] + diag[3]*u[1]);
936: uik[2] = -(diag[0]*u[2] + diag[2]*u[3]);
937: uik[3] = -(diag[1]*u[2] + diag[3]*u[3]);
939: /* update D(k) += -U(i,k)^T * U_bar(i,k): dk += uik*ba[ili] */
940: dk[0] += uik[0]*u[0] + uik[1]*u[1];
941: dk[1] += uik[2]*u[0] + uik[3]*u[1];
942: dk[2] += uik[0]*u[2] + uik[1]*u[3];
943: dk[3] += uik[2]*u[2] + uik[3]*u[3];
945: PetscLogFlops(16.0*2.0);
947: /* update -U(i,k): ba[ili] = uik */
948: PetscArraycpy(ba+ili*4,uik,4);
950: /* add multiple of row i to k-th row ... */
951: jmin = ili + 1; jmax = bi[i+1];
952: if (jmin < jmax) {
953: for (j=jmin; j<jmax; j++) {
954: /* rtmp += -U(i,k)^T * U_bar(i,j): rtmp[bj[j]] += uik*ba[j]; */
955: rtmp_ptr = rtmp + bj[j]*4;
956: u = ba + j*4;
957: rtmp_ptr[0] += uik[0]*u[0] + uik[1]*u[1];
958: rtmp_ptr[1] += uik[2]*u[0] + uik[3]*u[1];
959: rtmp_ptr[2] += uik[0]*u[2] + uik[1]*u[3];
960: rtmp_ptr[3] += uik[2]*u[2] + uik[3]*u[3];
961: }
962: PetscLogFlops(16.0*(jmax-jmin));
964: /* ... add i to row list for next nonzero entry */
965: il[i] = jmin; /* update il(i) in column k+1, ... mbs-1 */
966: j = bj[jmin];
967: jl[i] = jl[j]; jl[j] = i; /* update jl */
968: }
969: i = nexti;
970: }
972: /* save nonzero entries in k-th row of U ... */
974: /* invert diagonal block */
975: diag = ba+k*4;
976: PetscArraycpy(diag,dk,4);
977: PetscKernel_A_gets_inverse_A_2(diag,shift,allowzeropivot,&zeropivotdetected);
978: if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
980: jmin = bi[k]; jmax = bi[k+1];
981: if (jmin < jmax) {
982: for (j=jmin; j<jmax; j++) {
983: vj = bj[j]; /* block col. index of U */
984: u = ba + j*4;
985: rtmp_ptr = rtmp + vj*4;
986: for (k1=0; k1<4; k1++) {
987: *u++ = *rtmp_ptr;
988: *rtmp_ptr++ = 0.0;
989: }
990: }
992: /* ... add k to row list for first nonzero entry in k-th row */
993: il[k] = jmin;
994: i = bj[jmin];
995: jl[k] = jl[i]; jl[i] = k;
996: }
997: }
999: PetscFree(rtmp);
1000: PetscFree2(il,jl);
1001: if (a->permute) {
1002: PetscFree(aa);
1003: }
1004: ISRestoreIndices(perm,&perm_ptr);
1006: C->ops->solve = MatSolve_SeqSBAIJ_2_inplace;
1007: C->ops->solvetranspose = MatSolve_SeqSBAIJ_2_inplace;
1008: C->assembled = PETSC_TRUE;
1009: C->preallocated = PETSC_TRUE;
1011: PetscLogFlops(1.3333*8*b->mbs); /* from inverting diagonal blocks */
1012: return(0);
1013: }
1015: /*
1016: Version for when blocks are 2 by 2 Using natural ordering
1017: */
1018: PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_2_NaturalOrdering(Mat C,Mat A,const MatFactorInfo *info)
1019: {
1020: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ*)C->data;
1022: PetscInt i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
1023: PetscInt *ai,*aj,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
1024: MatScalar *ba = b->a,*aa,*ap,dk[8],uik[8];
1025: MatScalar *u,*diag,*rtmp,*rtmp_ptr;
1026: PetscReal shift = info->shiftamount;
1027: PetscBool allowzeropivot,zeropivotdetected;
1030: allowzeropivot = PetscNot(A->erroriffailure);
1032: /* initialization */
1033: /* il and jl record the first nonzero element in each row of the accessing
1034: window U(0:k, k:mbs-1).
1035: jl: list of rows to be added to uneliminated rows
1036: i>= k: jl(i) is the first row to be added to row i
1037: i< k: jl(i) is the row following row i in some list of rows
1038: jl(i) = mbs indicates the end of a list
1039: il(i): points to the first nonzero element in columns k,...,mbs-1 of
1040: row i of U */
1041: PetscCalloc1(4*mbs,&rtmp);
1042: PetscMalloc2(mbs,&il,mbs,&jl);
1043: il[0] = 0;
1044: for (i=0; i<mbs; i++) jl[i] = mbs;
1046: ai = a->i; aj = a->j; aa = a->a;
1048: /* for each row k */
1049: for (k = 0; k<mbs; k++) {
1051: /*initialize k-th row with elements nonzero in row k of A */
1052: jmin = ai[k]; jmax = ai[k+1];
1053: ap = aa + jmin*4;
1054: for (j = jmin; j < jmax; j++) {
1055: vj = aj[j]; /* block col. index */
1056: rtmp_ptr = rtmp + vj*4;
1057: for (i=0; i<4; i++) *rtmp_ptr++ = *ap++;
1058: }
1060: /* modify k-th row by adding in those rows i with U(i,k) != 0 */
1061: PetscArraycpy(dk,rtmp+k*4,4);
1062: i = jl[k]; /* first row to be added to k_th row */
1064: while (i < k) {
1065: nexti = jl[i]; /* next row to be added to k_th row */
1067: /* compute multiplier */
1068: ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */
1070: /* uik = -inv(Di)*U_bar(i,k): - ba[ili]*ba[i] */
1071: diag = ba + i*4;
1072: u = ba + ili*4;
1073: uik[0] = -(diag[0]*u[0] + diag[2]*u[1]);
1074: uik[1] = -(diag[1]*u[0] + diag[3]*u[1]);
1075: uik[2] = -(diag[0]*u[2] + diag[2]*u[3]);
1076: uik[3] = -(diag[1]*u[2] + diag[3]*u[3]);
1078: /* update D(k) += -U(i,k)^T * U_bar(i,k): dk += uik*ba[ili] */
1079: dk[0] += uik[0]*u[0] + uik[1]*u[1];
1080: dk[1] += uik[2]*u[0] + uik[3]*u[1];
1081: dk[2] += uik[0]*u[2] + uik[1]*u[3];
1082: dk[3] += uik[2]*u[2] + uik[3]*u[3];
1084: PetscLogFlops(16.0*2.0);
1086: /* update -U(i,k): ba[ili] = uik */
1087: PetscArraycpy(ba+ili*4,uik,4);
1089: /* add multiple of row i to k-th row ... */
1090: jmin = ili + 1; jmax = bi[i+1];
1091: if (jmin < jmax) {
1092: for (j=jmin; j<jmax; j++) {
1093: /* rtmp += -U(i,k)^T * U_bar(i,j): rtmp[bj[j]] += uik*ba[j]; */
1094: rtmp_ptr = rtmp + bj[j]*4;
1095: u = ba + j*4;
1096: rtmp_ptr[0] += uik[0]*u[0] + uik[1]*u[1];
1097: rtmp_ptr[1] += uik[2]*u[0] + uik[3]*u[1];
1098: rtmp_ptr[2] += uik[0]*u[2] + uik[1]*u[3];
1099: rtmp_ptr[3] += uik[2]*u[2] + uik[3]*u[3];
1100: }
1101: PetscLogFlops(16.0*(jmax-jmin));
1103: /* ... add i to row list for next nonzero entry */
1104: il[i] = jmin; /* update il(i) in column k+1, ... mbs-1 */
1105: j = bj[jmin];
1106: jl[i] = jl[j]; jl[j] = i; /* update jl */
1107: }
1108: i = nexti;
1109: }
1111: /* save nonzero entries in k-th row of U ... */
1113: /* invert diagonal block */
1114: diag = ba+k*4;
1115: PetscArraycpy(diag,dk,4);
1116: PetscKernel_A_gets_inverse_A_2(diag,shift,allowzeropivot,&zeropivotdetected);
1117: if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
1119: jmin = bi[k]; jmax = bi[k+1];
1120: if (jmin < jmax) {
1121: for (j=jmin; j<jmax; j++) {
1122: vj = bj[j]; /* block col. index of U */
1123: u = ba + j*4;
1124: rtmp_ptr = rtmp + vj*4;
1125: for (k1=0; k1<4; k1++) {
1126: *u++ = *rtmp_ptr;
1127: *rtmp_ptr++ = 0.0;
1128: }
1129: }
1131: /* ... add k to row list for first nonzero entry in k-th row */
1132: il[k] = jmin;
1133: i = bj[jmin];
1134: jl[k] = jl[i]; jl[i] = k;
1135: }
1136: }
1138: PetscFree(rtmp);
1139: PetscFree2(il,jl);
1141: C->ops->solve = MatSolve_SeqSBAIJ_2_NaturalOrdering_inplace;
1142: C->ops->solvetranspose = MatSolve_SeqSBAIJ_2_NaturalOrdering_inplace;
1143: C->ops->forwardsolve = MatForwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace;
1144: C->ops->backwardsolve = MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace;
1145: C->assembled = PETSC_TRUE;
1146: C->preallocated = PETSC_TRUE;
1148: PetscLogFlops(1.3333*8*b->mbs); /* from inverting diagonal blocks */
1149: return(0);
1150: }
1152: /*
1153: Numeric U^T*D*U factorization for SBAIJ format.
1154: Version for blocks are 1 by 1.
1155: */
1156: PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_1_inplace(Mat C,Mat A,const MatFactorInfo *info)
1157: {
1158: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data,*b=(Mat_SeqSBAIJ*)C->data;
1159: IS ip=b->row;
1161: const PetscInt *ai,*aj,*rip;
1162: PetscInt *a2anew,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j,*bcol;
1163: PetscInt k,jmin,jmax,*jl,*il,col,nexti,ili,nz;
1164: MatScalar *rtmp,*ba=b->a,*bval,*aa,dk,uikdi;
1165: PetscReal rs;
1166: FactorShiftCtx sctx;
1169: /* MatPivotSetUp(): initialize shift context sctx */
1170: PetscMemzero(&sctx,sizeof(FactorShiftCtx));
1172: ISGetIndices(ip,&rip);
1173: if (!a->permute) {
1174: ai = a->i; aj = a->j; aa = a->a;
1175: } else {
1176: ai = a->inew; aj = a->jnew;
1177: nz = ai[mbs];
1178: PetscMalloc1(nz,&aa);
1179: a2anew = a->a2anew;
1180: bval = a->a;
1181: for (j=0; j<nz; j++) {
1182: aa[a2anew[j]] = *(bval++);
1183: }
1184: }
1186: /* initialization */
1187: /* il and jl record the first nonzero element in each row of the accessing
1188: window U(0:k, k:mbs-1).
1189: jl: list of rows to be added to uneliminated rows
1190: i>= k: jl(i) is the first row to be added to row i
1191: i< k: jl(i) is the row following row i in some list of rows
1192: jl(i) = mbs indicates the end of a list
1193: il(i): points to the first nonzero element in columns k,...,mbs-1 of
1194: row i of U */
1195: PetscMalloc3(mbs,&rtmp,mbs,&il,mbs,&jl);
1197: do {
1198: sctx.newshift = PETSC_FALSE;
1199: il[0] = 0;
1200: for (i=0; i<mbs; i++) {
1201: rtmp[i] = 0.0; jl[i] = mbs;
1202: }
1204: for (k = 0; k<mbs; k++) {
1205: /*initialize k-th row by the perm[k]-th row of A */
1206: jmin = ai[rip[k]]; jmax = ai[rip[k]+1];
1207: bval = ba + bi[k];
1208: for (j = jmin; j < jmax; j++) {
1209: col = rip[aj[j]];
1210: rtmp[col] = aa[j];
1211: *bval++ = 0.0; /* for in-place factorization */
1212: }
1214: /* shift the diagonal of the matrix */
1215: if (sctx.nshift) rtmp[k] += sctx.shift_amount;
1217: /* modify k-th row by adding in those rows i with U(i,k)!=0 */
1218: dk = rtmp[k];
1219: i = jl[k]; /* first row to be added to k_th row */
1221: while (i < k) {
1222: nexti = jl[i]; /* next row to be added to k_th row */
1224: /* compute multiplier, update diag(k) and U(i,k) */
1225: ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */
1226: uikdi = -ba[ili]*ba[bi[i]]; /* diagonal(k) */
1227: dk += uikdi*ba[ili];
1228: ba[ili] = uikdi; /* -U(i,k) */
1230: /* add multiple of row i to k-th row */
1231: jmin = ili + 1; jmax = bi[i+1];
1232: if (jmin < jmax) {
1233: for (j=jmin; j<jmax; j++) rtmp[bj[j]] += uikdi*ba[j];
1234: PetscLogFlops(2.0*(jmax-jmin));
1236: /* update il and jl for row i */
1237: il[i] = jmin;
1238: j = bj[jmin]; jl[i] = jl[j]; jl[j] = i;
1239: }
1240: i = nexti;
1241: }
1243: /* shift the diagonals when zero pivot is detected */
1244: /* compute rs=sum of abs(off-diagonal) */
1245: rs = 0.0;
1246: jmin = bi[k]+1;
1247: nz = bi[k+1] - jmin;
1248: if (nz) {
1249: bcol = bj + jmin;
1250: while (nz--) {
1251: rs += PetscAbsScalar(rtmp[*bcol]);
1252: bcol++;
1253: }
1254: }
1256: sctx.rs = rs;
1257: sctx.pv = dk;
1258: MatPivotCheck(C,A,info,&sctx,k);
1259: if (sctx.newshift) break; /* sctx.shift_amount is updated */
1260: dk = sctx.pv;
1262: /* copy data into U(k,:) */
1263: ba[bi[k]] = 1.0/dk; /* U(k,k) */
1264: jmin = bi[k]+1; jmax = bi[k+1];
1265: if (jmin < jmax) {
1266: for (j=jmin; j<jmax; j++) {
1267: col = bj[j]; ba[j] = rtmp[col]; rtmp[col] = 0.0;
1268: }
1269: /* add the k-th row into il and jl */
1270: il[k] = jmin;
1271: i = bj[jmin]; jl[k] = jl[i]; jl[i] = k;
1272: }
1273: }
1274: } while (sctx.newshift);
1275: PetscFree3(rtmp,il,jl);
1276: if (a->permute) {PetscFree(aa);}
1278: ISRestoreIndices(ip,&rip);
1280: C->ops->solve = MatSolve_SeqSBAIJ_1_inplace;
1281: C->ops->solves = MatSolves_SeqSBAIJ_1_inplace;
1282: C->ops->solvetranspose = MatSolve_SeqSBAIJ_1_inplace;
1283: C->ops->forwardsolve = MatForwardSolve_SeqSBAIJ_1_inplace;
1284: C->ops->backwardsolve = MatBackwardSolve_SeqSBAIJ_1_inplace;
1285: C->assembled = PETSC_TRUE;
1286: C->preallocated = PETSC_TRUE;
1288: PetscLogFlops(C->rmap->N);
1289: if (sctx.nshift) {
1290: if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1291: PetscInfo2(A,"number of shiftnz tries %D, shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);
1292: } else if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
1293: PetscInfo2(A,"number of shiftpd tries %D, shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);
1294: }
1295: }
1296: return(0);
1297: }
1299: /*
1300: Version for when blocks are 1 by 1 Using natural ordering under new datastructure
1301: Modified from MatCholeskyFactorNumeric_SeqAIJ()
1302: */
1303: PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering(Mat B,Mat A,const MatFactorInfo *info)
1304: {
1305: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
1306: Mat_SeqSBAIJ *b=(Mat_SeqSBAIJ*)B->data;
1308: PetscInt i,j,mbs=A->rmap->n,*bi=b->i,*bj=b->j,*bdiag=b->diag,*bjtmp;
1309: PetscInt *ai=a->i,*aj=a->j,*ajtmp;
1310: PetscInt k,jmin,jmax,*c2r,*il,col,nexti,ili,nz;
1311: MatScalar *rtmp,*ba=b->a,*bval,*aa=a->a,dk,uikdi;
1312: FactorShiftCtx sctx;
1313: PetscReal rs;
1314: MatScalar d,*v;
1317: PetscMalloc3(mbs,&rtmp,mbs,&il,mbs,&c2r);
1319: /* MatPivotSetUp(): initialize shift context sctx */
1320: PetscMemzero(&sctx,sizeof(FactorShiftCtx));
1322: if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1323: sctx.shift_top = info->zeropivot;
1325: PetscArrayzero(rtmp,mbs);
1327: for (i=0; i<mbs; i++) {
1328: /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1329: d = (aa)[a->diag[i]];
1330: rtmp[i] += -PetscRealPart(d); /* diagonal entry */
1331: ajtmp = aj + ai[i] + 1; /* exclude diagonal */
1332: v = aa + ai[i] + 1;
1333: nz = ai[i+1] - ai[i] - 1;
1334: for (j=0; j<nz; j++) {
1335: rtmp[i] += PetscAbsScalar(v[j]);
1336: rtmp[ajtmp[j]] += PetscAbsScalar(v[j]);
1337: }
1338: if (PetscRealPart(rtmp[i]) > sctx.shift_top) sctx.shift_top = PetscRealPart(rtmp[i]);
1339: }
1340: sctx.shift_top *= 1.1;
1341: sctx.nshift_max = 5;
1342: sctx.shift_lo = 0.;
1343: sctx.shift_hi = 1.;
1344: }
1346: /* allocate working arrays
1347: c2r: linked list, keep track of pivot rows for a given column. c2r[col]: head of the list for a given col
1348: il: for active k row, il[i] gives the index of the 1st nonzero entry in U[i,k:n-1] in bj and ba arrays
1349: */
1350: do {
1351: sctx.newshift = PETSC_FALSE;
1353: for (i=0; i<mbs; i++) c2r[i] = mbs;
1354: if (mbs) il[0] = 0;
1356: for (k = 0; k<mbs; k++) {
1357: /* zero rtmp */
1358: nz = bi[k+1] - bi[k];
1359: bjtmp = bj + bi[k];
1360: for (j=0; j<nz; j++) rtmp[bjtmp[j]] = 0.0;
1362: /* load in initial unfactored row */
1363: bval = ba + bi[k];
1364: jmin = ai[k]; jmax = ai[k+1];
1365: for (j = jmin; j < jmax; j++) {
1366: col = aj[j];
1367: rtmp[col] = aa[j];
1368: *bval++ = 0.0; /* for in-place factorization */
1369: }
1370: /* shift the diagonal of the matrix: ZeropivotApply() */
1371: rtmp[k] += sctx.shift_amount; /* shift the diagonal of the matrix */
1373: /* modify k-th row by adding in those rows i with U(i,k)!=0 */
1374: dk = rtmp[k];
1375: i = c2r[k]; /* first row to be added to k_th row */
1377: while (i < k) {
1378: nexti = c2r[i]; /* next row to be added to k_th row */
1380: /* compute multiplier, update diag(k) and U(i,k) */
1381: ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */
1382: uikdi = -ba[ili]*ba[bdiag[i]]; /* diagonal(k) */
1383: dk += uikdi*ba[ili]; /* update diag[k] */
1384: ba[ili] = uikdi; /* -U(i,k) */
1386: /* add multiple of row i to k-th row */
1387: jmin = ili + 1; jmax = bi[i+1];
1388: if (jmin < jmax) {
1389: for (j=jmin; j<jmax; j++) rtmp[bj[j]] += uikdi*ba[j];
1390: /* update il and c2r for row i */
1391: il[i] = jmin;
1392: j = bj[jmin]; c2r[i] = c2r[j]; c2r[j] = i;
1393: }
1394: i = nexti;
1395: }
1397: /* copy data into U(k,:) */
1398: rs = 0.0;
1399: jmin = bi[k]; jmax = bi[k+1]-1;
1400: if (jmin < jmax) {
1401: for (j=jmin; j<jmax; j++) {
1402: col = bj[j]; ba[j] = rtmp[col]; rs += PetscAbsScalar(ba[j]);
1403: }
1404: /* add the k-th row into il and c2r */
1405: il[k] = jmin;
1406: i = bj[jmin]; c2r[k] = c2r[i]; c2r[i] = k;
1407: }
1409: sctx.rs = rs;
1410: sctx.pv = dk;
1411: MatPivotCheck(B,A,info,&sctx,k);
1412: if (sctx.newshift) break;
1413: dk = sctx.pv;
1415: ba[bdiag[k]] = 1.0/dk; /* U(k,k) */
1416: }
1417: } while (sctx.newshift);
1419: PetscFree3(rtmp,il,c2r);
1421: B->ops->solve = MatSolve_SeqSBAIJ_1_NaturalOrdering;
1422: B->ops->solves = MatSolves_SeqSBAIJ_1;
1423: B->ops->solvetranspose = MatSolve_SeqSBAIJ_1_NaturalOrdering;
1424: B->ops->matsolve = MatMatSolve_SeqSBAIJ_1_NaturalOrdering;
1425: B->ops->forwardsolve = MatForwardSolve_SeqSBAIJ_1_NaturalOrdering;
1426: B->ops->backwardsolve = MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering;
1428: B->assembled = PETSC_TRUE;
1429: B->preallocated = PETSC_TRUE;
1431: PetscLogFlops(B->rmap->n);
1433: /* MatPivotView() */
1434: if (sctx.nshift) {
1435: if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
1436: PetscInfo4(A,"number of shift_pd tries %D, shift_amount %g, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,(double)sctx.shift_amount,(double)sctx.shift_fraction,(double)sctx.shift_top);
1437: } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1438: PetscInfo2(A,"number of shift_nz tries %D, shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);
1439: } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
1440: PetscInfo2(A,"number of shift_inblocks applied %D, each shift_amount %g\n",sctx.nshift,(double)info->shiftamount);
1441: }
1442: }
1443: return(0);
1444: }
1446: PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering_inplace(Mat C,Mat A,const MatFactorInfo *info)
1447: {
1448: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data,*b=(Mat_SeqSBAIJ*)C->data;
1450: PetscInt i,j,mbs = a->mbs;
1451: PetscInt *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
1452: PetscInt k,jmin,*jl,*il,nexti,ili,*acol,*bcol,nz;
1453: MatScalar *rtmp,*ba=b->a,*aa=a->a,dk,uikdi,*aval,*bval;
1454: PetscReal rs;
1455: FactorShiftCtx sctx;
1458: /* MatPivotSetUp(): initialize shift context sctx */
1459: PetscMemzero(&sctx,sizeof(FactorShiftCtx));
1461: /* initialization */
1462: /* il and jl record the first nonzero element in each row of the accessing
1463: window U(0:k, k:mbs-1).
1464: jl: list of rows to be added to uneliminated rows
1465: i>= k: jl(i) is the first row to be added to row i
1466: i< k: jl(i) is the row following row i in some list of rows
1467: jl(i) = mbs indicates the end of a list
1468: il(i): points to the first nonzero element in U(i,k:mbs-1)
1469: */
1470: PetscMalloc1(mbs,&rtmp);
1471: PetscMalloc2(mbs,&il,mbs,&jl);
1473: do {
1474: sctx.newshift = PETSC_FALSE;
1475: il[0] = 0;
1476: for (i=0; i<mbs; i++) {
1477: rtmp[i] = 0.0; jl[i] = mbs;
1478: }
1480: for (k = 0; k<mbs; k++) {
1481: /*initialize k-th row with elements nonzero in row perm(k) of A */
1482: nz = ai[k+1] - ai[k];
1483: acol = aj + ai[k];
1484: aval = aa + ai[k];
1485: bval = ba + bi[k];
1486: while (nz--) {
1487: rtmp[*acol++] = *aval++;
1488: *bval++ = 0.0; /* for in-place factorization */
1489: }
1491: /* shift the diagonal of the matrix */
1492: if (sctx.nshift) rtmp[k] += sctx.shift_amount;
1494: /* modify k-th row by adding in those rows i with U(i,k)!=0 */
1495: dk = rtmp[k];
1496: i = jl[k]; /* first row to be added to k_th row */
1498: while (i < k) {
1499: nexti = jl[i]; /* next row to be added to k_th row */
1500: /* compute multiplier, update D(k) and U(i,k) */
1501: ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */
1502: uikdi = -ba[ili]*ba[bi[i]];
1503: dk += uikdi*ba[ili];
1504: ba[ili] = uikdi; /* -U(i,k) */
1506: /* add multiple of row i to k-th row ... */
1507: jmin = ili + 1;
1508: nz = bi[i+1] - jmin;
1509: if (nz > 0) {
1510: bcol = bj + jmin;
1511: bval = ba + jmin;
1512: PetscLogFlops(2.0*nz);
1513: while (nz--) rtmp[*bcol++] += uikdi*(*bval++);
1515: /* update il and jl for i-th row */
1516: il[i] = jmin;
1517: j = bj[jmin]; jl[i] = jl[j]; jl[j] = i;
1518: }
1519: i = nexti;
1520: }
1522: /* shift the diagonals when zero pivot is detected */
1523: /* compute rs=sum of abs(off-diagonal) */
1524: rs = 0.0;
1525: jmin = bi[k]+1;
1526: nz = bi[k+1] - jmin;
1527: if (nz) {
1528: bcol = bj + jmin;
1529: while (nz--) {
1530: rs += PetscAbsScalar(rtmp[*bcol]);
1531: bcol++;
1532: }
1533: }
1535: sctx.rs = rs;
1536: sctx.pv = dk;
1537: MatPivotCheck(C,A,info,&sctx,k);
1538: if (sctx.newshift) break; /* sctx.shift_amount is updated */
1539: dk = sctx.pv;
1541: /* copy data into U(k,:) */
1542: ba[bi[k]] = 1.0/dk;
1543: jmin = bi[k]+1;
1544: nz = bi[k+1] - jmin;
1545: if (nz) {
1546: bcol = bj + jmin;
1547: bval = ba + jmin;
1548: while (nz--) {
1549: *bval++ = rtmp[*bcol];
1550: rtmp[*bcol++] = 0.0;
1551: }
1552: /* add k-th row into il and jl */
1553: il[k] = jmin;
1554: i = bj[jmin]; jl[k] = jl[i]; jl[i] = k;
1555: }
1556: } /* end of for (k = 0; k<mbs; k++) */
1557: } while (sctx.newshift);
1558: PetscFree(rtmp);
1559: PetscFree2(il,jl);
1561: C->ops->solve = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
1562: C->ops->solves = MatSolves_SeqSBAIJ_1_inplace;
1563: C->ops->solvetranspose = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
1564: C->ops->forwardsolve = MatForwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
1565: C->ops->backwardsolve = MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
1567: C->assembled = PETSC_TRUE;
1568: C->preallocated = PETSC_TRUE;
1570: PetscLogFlops(C->rmap->N);
1571: if (sctx.nshift) {
1572: if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1573: PetscInfo2(A,"number of shiftnz tries %D, shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);
1574: } else if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
1575: PetscInfo2(A,"number of shiftpd tries %D, shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);
1576: }
1577: }
1578: return(0);
1579: }
1581: PetscErrorCode MatCholeskyFactor_SeqSBAIJ(Mat A,IS perm,const MatFactorInfo *info)
1582: {
1584: Mat C;
1587: MatGetFactor(A,"petsc",MAT_FACTOR_CHOLESKY,&C);
1588: MatCholeskyFactorSymbolic(C,A,perm,info);
1589: MatCholeskyFactorNumeric(C,A,info);
1591: A->ops->solve = C->ops->solve;
1592: A->ops->solvetranspose = C->ops->solvetranspose;
1594: MatHeaderMerge(A,&C);
1595: return(0);
1596: }